Diffstat (limited to 'llvm')
-rw-r--r--  llvm/CMakeLists.txt | 2
-rw-r--r--  llvm/cmake/modules/LLVMProcessSources.cmake | 9
-rw-r--r--  llvm/docs/AMDGPUUsage.rst | 9
-rw-r--r--  llvm/docs/CodingStandards.rst | 36
-rw-r--r--  llvm/docs/GettingStarted.rst | 2
-rw-r--r--  llvm/docs/LangRef.rst | 14
-rw-r--r--  llvm/docs/ProgrammersManual.rst | 190
-rw-r--r--  llvm/docs/ReleaseNotes.md | 6
-rw-r--r--  llvm/docs/TableGen/ProgRef.rst | 39
-rw-r--r--  llvm/docs/YamlIO.rst | 92
-rw-r--r--  llvm/include/llvm/Analysis/DXILResource.h | 2
-rw-r--r--  llvm/include/llvm/Analysis/IR2Vec.h | 2
-rw-r--r--  llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h | 6
-rw-r--r--  llvm/include/llvm/Analysis/VectorUtils.h | 6
-rw-r--r--  llvm/include/llvm/CodeGen/CommandFlags.h | 2
-rw-r--r--  llvm/include/llvm/CodeGen/MIRYamlMapping.h | 3
-rw-r--r--  llvm/include/llvm/CodeGen/MachineFunction.h | 2
-rw-r--r--  llvm/include/llvm/CodeGen/SelectionDAG.h | 5
-rw-r--r--  llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 10
-rw-r--r--  llvm/include/llvm/Config/llvm-config.h.cmake | 3
-rw-r--r--  llvm/include/llvm/Demangle/DemangleConfig.h | 5
-rw-r--r--  llvm/include/llvm/IR/DiagnosticInfo.h | 25
-rw-r--r--  llvm/include/llvm/IR/IRBuilder.h | 2
-rw-r--r--  llvm/include/llvm/IR/Intrinsics.h | 11
-rw-r--r--  llvm/include/llvm/IR/Metadata.h | 24
-rw-r--r--  llvm/include/llvm/MC/MCAsmInfo.h | 8
-rw-r--r--  llvm/include/llvm/MC/MCAsmInfoCOFF.h | 3
-rw-r--r--  llvm/include/llvm/MC/MCAsmInfoDarwin.h | 3
-rw-r--r--  llvm/include/llvm/MC/MCAsmInfoELF.h | 3
-rw-r--r--  llvm/include/llvm/MC/MCAsmInfoGOFF.h | 3
-rw-r--r--  llvm/include/llvm/MC/MCAsmInfoWasm.h | 3
-rw-r--r--  llvm/include/llvm/MC/MCAsmInfoXCOFF.h | 5
-rw-r--r--  llvm/include/llvm/MC/MCContext.h | 2
-rw-r--r--  llvm/include/llvm/MC/MCMachObjectWriter.h | 4
-rw-r--r--  llvm/include/llvm/MC/MCObjectStreamer.h | 13
-rw-r--r--  llvm/include/llvm/MC/MCSection.h | 90
-rw-r--r--  llvm/include/llvm/MC/MCSectionCOFF.h | 12
-rw-r--r--  llvm/include/llvm/MC/MCSectionDXContainer.h | 8
-rw-r--r--  llvm/include/llvm/MC/MCSectionELF.h | 18
-rw-r--r--  llvm/include/llvm/MC/MCSectionGOFF.h | 30
-rw-r--r--  llvm/include/llvm/MC/MCSectionMachO.h | 12
-rw-r--r--  llvm/include/llvm/MC/MCSectionSPIRV.h | 11
-rw-r--r--  llvm/include/llvm/MC/MCSectionWasm.h | 13
-rw-r--r--  llvm/include/llvm/MC/MCSectionXCOFF.h | 13
-rw-r--r--  llvm/include/llvm/MC/MCStreamer.h | 4
-rw-r--r--  llvm/include/llvm/MC/MCSymbolELF.h | 1
-rw-r--r--  llvm/include/llvm/MC/MCXCOFFStreamer.h | 1
-rw-r--r--  llvm/include/llvm/Support/AMDHSAKernelDescriptor.h | 3
-rw-r--r--  llvm/include/llvm/Support/Debug.h | 12
-rw-r--r--  llvm/include/llvm/Support/DebugLog.h | 161
-rw-r--r--  llvm/include/llvm/Support/ThreadPool.h | 8
-rw-r--r--  llvm/include/llvm/TableGen/Record.h | 2
-rw-r--r--  llvm/include/llvm/Target/TargetOptions.h | 12
-rw-r--r--  llvm/include/llvm/Target/TargetSelectionDAG.td | 3
-rw-r--r--  llvm/include/llvm/TargetParser/Triple.h | 7
-rw-r--r--  llvm/include/llvm/TextAPI/SymbolSet.h | 1
-rw-r--r--  llvm/include/llvm/Transforms/HipStdPar/HipStdPar.h | 7
-rw-r--r--  llvm/include/llvm/Transforms/ObjCARC.h | 4
-rw-r--r--  llvm/include/module.modulemap | 1
-rw-r--r--  llvm/lib/Analysis/ConstantFolding.cpp | 11
-rw-r--r--  llvm/lib/Analysis/DXILResource.cpp | 7
-rw-r--r--  llvm/lib/Analysis/DependenceAnalysis.cpp | 17
-rw-r--r--  llvm/lib/Analysis/InstructionSimplify.cpp | 2
-rw-r--r--  llvm/lib/Analysis/MemoryDependenceAnalysis.cpp | 44
-rw-r--r--  llvm/lib/Analysis/TargetLibraryInfo.cpp | 28
-rw-r--r--  llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp | 4
-rw-r--r--  llvm/lib/Analysis/VectorUtils.cpp | 24
-rw-r--r--  llvm/lib/CodeGen/AsmPrinter/AIXException.cpp | 4
-rw-r--r--  llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 9
-rw-r--r--  llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 4
-rw-r--r--  llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 7
-rw-r--r--  llvm/lib/CodeGen/BranchFolding.cpp | 48
-rw-r--r--  llvm/lib/CodeGen/CodeGenPrepare.cpp | 4
-rw-r--r--  llvm/lib/CodeGen/CommandFlags.cpp | 7
-rw-r--r--  llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp | 24
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 6
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/InterleavedAccessPass.cpp | 6
-rw-r--r--  llvm/lib/CodeGen/MIRParser/MILexer.cpp | 1
-rw-r--r--  llvm/lib/CodeGen/MIRParser/MILexer.h | 1
-rw-r--r--  llvm/lib/CodeGen/MIRParser/MIParser.cpp | 7
-rw-r--r--  llvm/lib/CodeGen/MIRParser/MIRParser.cpp | 14
-rw-r--r--  llvm/lib/CodeGen/MIRPrinter.cpp | 12
-rw-r--r--  llvm/lib/CodeGen/MachineFunction.cpp | 5
-rw-r--r--  llvm/lib/CodeGen/MachineOperand.cpp | 4
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 15
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 54
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 4
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 12
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 10
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/TargetLoweringBase.cpp | 34
-rw-r--r--  llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 21
-rw-r--r--  llvm/lib/CodeGen/WindowsSecureHotPatching.cpp | 13
-rw-r--r--  llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp | 21
-rw-r--r--  llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 26
-rw-r--r--  llvm/lib/IR/DiagnosticInfo.cpp | 4
-rw-r--r--  llvm/lib/IR/IRBuilder.cpp | 29
-rw-r--r--  llvm/lib/IR/Intrinsics.cpp | 24
-rw-r--r--  llvm/lib/IR/Metadata.cpp | 2
-rw-r--r--  llvm/lib/MC/CMakeLists.txt | 6
-rw-r--r--  llvm/lib/MC/ELFObjectWriter.cpp | 29
-rw-r--r--  llvm/lib/MC/GOFFObjectWriter.cpp | 2
-rw-r--r--  llvm/lib/MC/MCAsmInfoCOFF.cpp | 108
-rw-r--r--  llvm/lib/MC/MCAsmInfoDarwin.cpp | 5
-rw-r--r--  llvm/lib/MC/MCAsmInfoELF.cpp | 196
-rw-r--r--  llvm/lib/MC/MCAsmInfoGOFF.cpp | 138
-rw-r--r--  llvm/lib/MC/MCAsmInfoWasm.cpp | 83
-rw-r--r--  llvm/lib/MC/MCAsmInfoXCOFF.cpp | 124
-rw-r--r--  llvm/lib/MC/MCAsmStreamer.cpp | 10
-rw-r--r--  llvm/lib/MC/MCAssembler.cpp | 1
-rw-r--r--  llvm/lib/MC/MCContext.cpp | 54
-rw-r--r--  llvm/lib/MC/MCELFStreamer.cpp | 4
-rw-r--r--  llvm/lib/MC/MCExpr.cpp | 25
-rw-r--r--  llvm/lib/MC/MCFragment.cpp | 2
-rw-r--r--  llvm/lib/MC/MCMachOStreamer.cpp | 12
-rw-r--r--  llvm/lib/MC/MCObjectStreamer.cpp | 178
-rw-r--r--  llvm/lib/MC/MCParser/AsmParser.cpp | 2
-rw-r--r--  llvm/lib/MC/MCParser/ELFAsmParser.cpp | 4
-rw-r--r--  llvm/lib/MC/MCParser/MasmParser.cpp | 3
-rw-r--r--  llvm/lib/MC/MCParser/WasmAsmParser.cpp | 2
-rw-r--r--  llvm/lib/MC/MCSection.cpp | 8
-rw-r--r--  llvm/lib/MC/MCSectionCOFF.cpp | 117
-rw-r--r--  llvm/lib/MC/MCSectionDXContainer.cpp | 15
-rw-r--r--  llvm/lib/MC/MCSectionELF.cpp | 217
-rw-r--r--  llvm/lib/MC/MCSectionGOFF.cpp | 143
-rw-r--r--  llvm/lib/MC/MCSectionMachO.cpp | 28
-rw-r--r--  llvm/lib/MC/MCSectionWasm.cpp | 101
-rw-r--r--  llvm/lib/MC/MCSectionXCOFF.cpp | 134
-rw-r--r--  llvm/lib/MC/MCStreamer.cpp | 36
-rw-r--r--  llvm/lib/MC/MCWasmStreamer.cpp | 2
-rw-r--r--  llvm/lib/MC/MCWin64EH.cpp | 3
-rw-r--r--  llvm/lib/MC/MCWinCOFFStreamer.cpp | 8
-rw-r--r--  llvm/lib/MC/MCXCOFFStreamer.cpp | 26
-rw-r--r--  llvm/lib/MC/MachObjectWriter.cpp | 34
-rw-r--r--  llvm/lib/MC/WasmObjectWriter.cpp | 2
-rw-r--r--  llvm/lib/MC/WinCOFFObjectWriter.cpp | 5
-rw-r--r--  llvm/lib/MC/XCOFFObjectWriter.cpp | 8
-rw-r--r--  llvm/lib/Object/IRSymtab.cpp | 11
-rw-r--r--  llvm/lib/Passes/PassRegistry.def | 2
-rw-r--r--  llvm/lib/Support/Debug.cpp | 69
-rw-r--r--  llvm/lib/TableGen/Record.cpp | 26
-rw-r--r--  llvm/lib/TableGen/TGLexer.cpp | 2
-rw-r--r--  llvm/lib/TableGen/TGLexer.h | 2
-rw-r--r--  llvm/lib/TableGen/TGParser.cpp | 53
-rw-r--r--  llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 38
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 115
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.h | 9
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 274
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.h | 4
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.td | 26
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp | 2
-rw-r--r--  llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp | 3
-rw-r--r--  llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.td | 7
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 93
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/FLATInstructions.td | 73
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNRegPressure.h | 40
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 11
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 57
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSubtarget.h | 7
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 78
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 29
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 83
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td | 10
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 37
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 61
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 43
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 9
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 6
-rw-r--r--  llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 3
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp | 11
-rw-r--r--  llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 4
-rw-r--r--  llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp | 21
-rw-r--r--  llvm/lib/Target/AVR/AVRAsmPrinter.cpp | 2
-rw-r--r--  llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp | 2
-rw-r--r--  llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h | 4
-rw-r--r--  llvm/lib/Target/BPF/BTFDebug.cpp | 80
-rw-r--r--  llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp | 7
-rw-r--r--  llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h | 5
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp | 2
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonMask.cpp | 2
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp | 62
-rw-r--r--  llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp | 1
-rw-r--r--  llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp | 8
-rw-r--r--  llvm/lib/Target/Mips/MipsAsmPrinter.cpp | 8
-rw-r--r--  llvm/lib/Target/Mips/MipsISelLowering.cpp | 29
-rw-r--r--  llvm/lib/Target/Mips/MipsISelLowering.h | 1
-rw-r--r--  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp | 2
-rw-r--r--  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h | 2
-rw-r--r--  llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp | 5
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 273
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 2
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 629
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 10
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 133
-rw-r--r--  llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp | 9
-rw-r--r--  llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h | 3
-rw-r--r--  llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp | 2
-rw-r--r--  llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h | 2
-rw-r--r--  llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp | 2
-rw-r--r--  llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp | 8
-rw-r--r--  llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 32
-rw-r--r--  llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 9
-rw-r--r--  llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp | 149
-rw-r--r--  llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 1
-rw-r--r--  llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h | 1
-rw-r--r--  llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp | 2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 3
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 23
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.h | 1
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 3
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoC.td | 121
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoP.td | 3
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp | 2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVRegisterInfo.td | 5
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h | 4
-rw-r--r--  llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp | 4
-rw-r--r--  llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp | 6
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssembly.td | 15
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 47
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td | 3
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td | 11
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td | 2
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp | 28
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp | 6
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp | 5
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h | 2
-rw-r--r--  llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 40
-rw-r--r--  llvm/lib/Target/X86/X86PassRegistry.def | 44
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 131
-rw-r--r--  llvm/lib/TargetParser/RISCVISAInfo.cpp | 19
-rw-r--r--  llvm/lib/TargetParser/Triple.cpp | 6
-rw-r--r--  llvm/lib/TextAPI/SymbolSet.cpp | 5
-rw-r--r--  llvm/lib/Transforms/HipStdPar/HipStdPar.cpp | 118
-rw-r--r--  llvm/lib/Transforms/IPO/FunctionAttrs.cpp | 34
-rw-r--r--  llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp | 143
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp | 8
-rw-r--r--  llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp | 7
-rw-r--r--  llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp | 2
-rw-r--r--  llvm/lib/Transforms/ObjCARC/CMakeLists.txt | 1
-rw-r--r--  llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp | 156
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopFuse.cpp | 25
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 114
-rw-r--r--  llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 12
-rw-r--r--  llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 130
-rw-r--r--  llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp | 36
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 69
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 703
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.cpp | 6
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.h | 25
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp | 44
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp | 10
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 51
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 69
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 10
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 7
-rw-r--r--  llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 124
-rw-r--r--  llvm/test/Analysis/CostModel/AArch64/sve-fcmp.ll | 788
-rw-r--r--  llvm/test/Analysis/CostModel/RISCV/arith-fp.ll | 3684
-rw-r--r--  llvm/test/Analysis/CostModel/RISCV/masked_ldst.ll | 8
-rw-r--r--  llvm/test/Analysis/CostModel/X86/fshl.ll | 372
-rw-r--r--  llvm/test/Analysis/CostModel/X86/fshr.ll | 372
-rw-r--r--  llvm/test/Analysis/CostModel/X86/load-broadcast.ll | 500
-rw-r--r--  llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll | 16
-rw-r--r--  llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll | 16
-rw-r--r--  llvm/test/Analysis/CostModel/X86/shuffle-broadcast-fp16.ll | 10
-rw-r--r--  llvm/test/Analysis/CostModel/X86/shuffle-broadcast.ll | 256
-rw-r--r--  llvm/test/Analysis/CostModel/X86/shuffle-load.ll | 108
-rw-r--r--  llvm/test/Analysis/CostModel/X86/shuffle-reverse-fp16.ll | 8
-rw-r--r--  llvm/test/Analysis/CostModel/X86/shuffle-reverse.ll | 216
-rw-r--r--  llvm/test/Analysis/CostModel/X86/shuffle-single-src-fp16.ll | 16
-rw-r--r--  llvm/test/Analysis/CostModel/X86/shuffle-single-src.ll | 260
-rw-r--r--  llvm/test/Analysis/CostModel/X86/shuffle-splat.ll | 256
-rw-r--r--  llvm/test/Analysis/CostModel/X86/shuffle-two-src.ll | 232
-rw-r--r--  llvm/test/Analysis/CostModel/X86/vshift-ashr-cost-inseltpoison.ll | 156
-rw-r--r--  llvm/test/Analysis/CostModel/X86/vshift-ashr-cost.ll | 156
-rw-r--r--  llvm/test/Analysis/CostModel/X86/vshift-lshr-cost-inseltpoison.ll | 148
-rw-r--r--  llvm/test/Analysis/CostModel/X86/vshift-lshr-cost.ll | 148
-rw-r--r--  llvm/test/Analysis/CostModel/X86/vshift-shl-cost-inseltpoison.ll | 152
-rw-r--r--  llvm/test/Analysis/CostModel/X86/vshift-shl-cost.ll | 152
-rw-r--r--  llvm/test/Analysis/DependenceAnalysis/Banerjee.ll | 6
-rw-r--r--  llvm/test/Analysis/DependenceAnalysis/FlipFlopBaseAddress.ll | 79
-rw-r--r--  llvm/test/Analysis/DependenceAnalysis/GCD.ll | 8
-rw-r--r--  llvm/test/Analysis/DependenceAnalysis/NonAffineExpr.ll | 2
-rw-r--r--  llvm/test/Analysis/DependenceAnalysis/Preliminary.ll | 2
-rw-r--r--  llvm/test/Analysis/DependenceAnalysis/PreliminaryNoValidityCheckFixedSize.ll | 4
-rw-r--r--  llvm/test/Analysis/DependenceAnalysis/SymbolicRDIV.ll | 2
-rw-r--r--  llvm/test/Analysis/LoopAccessAnalysis/runtime-check-known-true.ll | 50
-rw-r--r--  llvm/test/CMakeLists.txt | 1
-rw-r--r--  llvm/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir | 58
-rw-r--r--  llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir | 364
-rw-r--r--  llvm/test/CodeGen/AArch64/andcompare.ll | 3190
-rw-r--r--  llvm/test/CodeGen/AArch64/andorbrcompare.ll | 532
-rw-r--r--  llvm/test/CodeGen/AArch64/arm64-ccmp.ll | 852
-rw-r--r--  llvm/test/CodeGen/AArch64/arm64-fml-combines.ll | 20
-rw-r--r--  llvm/test/CodeGen/AArch64/cmp-chains.ll | 566
-rw-r--r--  llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll | 134
-rw-r--r--  llvm/test/CodeGen/AArch64/concat-vector.ll | 5
-rw-r--r--  llvm/test/CodeGen/AArch64/dag-combine-select.ll | 88
-rw-r--r--  llvm/test/CodeGen/AArch64/fcsel-zero.ll | 35
-rw-r--r--  llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll | 50
-rw-r--r--  llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll | 130
-rw-r--r--  llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_2op.ll | 202
-rw-r--r--  llvm/test/CodeGen/AArch64/fsh.ll | 113
-rw-r--r--  llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll | 162
-rw-r--r--  llvm/test/CodeGen/AArch64/llvm.frexp.ll | 14
-rw-r--r--  llvm/test/CodeGen/AArch64/machine-combiner-reassociate.mir | 22
-rw-r--r--  llvm/test/CodeGen/AArch64/machine-combiner.ll | 887
-rw-r--r--  llvm/test/CodeGen/AArch64/machine-combiner.mir | 6
-rw-r--r--  llvm/test/CodeGen/AArch64/neon-dotreduce.ll | 345
-rw-r--r--  llvm/test/CodeGen/AArch64/nontemporal.ll | 48
-rw-r--r--  llvm/test/CodeGen/AArch64/preferred-function-alignment.ll | 7
-rw-r--r--  llvm/test/CodeGen/AArch64/ptrauth-isel.ll | 269
-rw-r--r--  llvm/test/CodeGen/AArch64/ptrauth-isel.mir | 205
-rw-r--r--  llvm/test/CodeGen/AArch64/sched-past-vector-ldst.ll | 18
-rw-r--r--  llvm/test/CodeGen/AArch64/sqrt-fastmath.ll | 53
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir | 2772
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir | 2496
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll | 891
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir | 40
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll | 8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 240
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir | 121
-rw-r--r--  llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir | 24
-rw-r--r--  llvm/test/CodeGen/AMDGPU/bf16-math.ll | 142
-rw-r--r--  llvm/test/CodeGen/AMDGPU/bf16.ll | 119
-rw-r--r--  llvm/test/CodeGen/AMDGPU/build_vector.ll | 28
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll | 10
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll | 240
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll | 3195
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fptrunc.ll | 1173
-rw-r--r--  llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll | 100
-rw-r--r--  llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll | 95
-rw-r--r--  llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll | 3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll | 88
-rw-r--r--  llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll | 24
-rw-r--r--  llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir | 32
-rw-r--r--  llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll | 14
-rw-r--r--  llvm/test/CodeGen/AMDGPU/large-avgpr-assign-last.mir | 94
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll | 40
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll | 61
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll | 587
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll | 1856
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll | 4953
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll | 328
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 2278
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll | 183
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll | 1238
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll | 174
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 878
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll | 147
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 270
-rw-r--r--  llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll | 30
-rw-r--r--  llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll | 56
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll | 20
-rw-r--r--  llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll | 1074
-rw-r--r--  llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll | 51
-rw-r--r--  llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll | 42
-rw-r--r--  llvm/test/CodeGen/AMDGPU/read-write-register-illegal-type.ll | 29
-rw-r--r--  llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll | 36
-rw-r--r--  llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll | 106
-rw-r--r--  llvm/test/CodeGen/AMDGPU/spill-agpr.ll | 482
-rw-r--r--  llvm/test/CodeGen/AMDGPU/ssubo.ll | 676
-rw-r--r--  llvm/test/CodeGen/AMDGPU/structurize-hoist.ll | 180
-rw-r--r--  llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll | 120
-rw-r--r--  llvm/test/CodeGen/AMDGPU/uaddo.ll | 553
-rw-r--r--  llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll | 47
-rw-r--r--  llvm/test/CodeGen/AMDGPU/usubo.ll | 499
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll | 30
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll | 258
-rw-r--r--  llvm/test/CodeGen/AMDGPU/wwm-regalloc-error.ll | 2
-rw-r--r--  llvm/test/CodeGen/ARM/fp16.ll | 4
-rw-r--r--  llvm/test/CodeGen/ARM/preferred-function-alignment.ll | 8
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/map-def-2.ll | 61
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/map-def-3.ll | 42
-rw-r--r--  llvm/test/CodeGen/BPF/BTF/map-def-nested-array.ll | 75
-rw-r--r--  llvm/test/CodeGen/DirectX/Binding/binding-overlap-6.ll | 24
-rw-r--r--  llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll | 5
-rw-r--r--  llvm/test/CodeGen/LoongArch/lasx/build-vector.ll | 362
-rw-r--r--  llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll | 45
-rw-r--r--  llvm/test/CodeGen/LoongArch/llvm.exp10.ll | 6
-rw-r--r--  llvm/test/CodeGen/LoongArch/llvm.sincos.ll | 54
-rw-r--r--  llvm/test/CodeGen/LoongArch/lsx/build-vector.ll | 166
-rw-r--r--  llvm/test/CodeGen/MIR/AMDGPU/noalias-addrspace-expect-id.mir | 29
-rw-r--r--  llvm/test/CodeGen/MIR/AMDGPU/noalias-addrspace-parse.mir | 36
-rw-r--r--  llvm/test/CodeGen/MIR/AMDGPU/noalias-addrspace-undefine-matadata.mir | 28
-rw-r--r--  llvm/test/CodeGen/MIR/X86/call-site-info-ambiguous-indirect-call-typeid.mir | 31
-rw-r--r--  llvm/test/CodeGen/MIR/X86/call-site-info-direct-calls-typeid.mir | 54
-rw-r--r--  llvm/test/CodeGen/MIR/X86/call-site-info-typeid.mir | 28
-rw-r--r--  llvm/test/CodeGen/Mips/abiflags-soft-float.ll | 12
-rw-r--r--  llvm/test/CodeGen/Mips/llvm.frexp.ll | 651
-rw-r--r--  llvm/test/CodeGen/Mips/llvm.sincos.ll | 1044
-rw-r--r--  llvm/test/CodeGen/Mips/nan_lowering.ll | 25
-rw-r--r--  llvm/test/CodeGen/Mips/qnan.ll | 14
-rw-r--r--  llvm/test/CodeGen/NVPTX/aggregate-return.ll | 48
-rw-r--r--  llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll | 6
-rw-r--r--  llvm/test/CodeGen/NVPTX/byval-const-global.ll | 8
-rw-r--r--  llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll | 10
-rw-r--r--  llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll | 16
-rw-r--r--  llvm/test/CodeGen/NVPTX/combine-mad.ll | 4
-rw-r--r--  llvm/test/CodeGen/NVPTX/compare-int.ll | 621
-rw-r--r--  llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll | 172
-rw-r--r--  llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll | 4
-rw-r--r--  llvm/test/CodeGen/NVPTX/f16x2-instructions.ll | 12
-rw-r--r--  llvm/test/CodeGen/NVPTX/f32x2-instructions.ll | 12
-rw-r--r--  llvm/test/CodeGen/NVPTX/fma.ll | 8
-rw-r--r--  llvm/test/CodeGen/NVPTX/forward-ld-param.ll | 2
-rw-r--r--  llvm/test/CodeGen/NVPTX/i128-param.ll | 20
-rw-r--r--  llvm/test/CodeGen/NVPTX/i16x2-instructions.ll | 12
-rw-r--r--  llvm/test/CodeGen/NVPTX/i8x2-instructions.ll | 121
-rw-r--r--  llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 30
-rw-r--r--  llvm/test/CodeGen/NVPTX/idioms.ll | 2
-rw-r--r--  llvm/test/CodeGen/NVPTX/indirect_byval.ll | 20
-rw-r--r--  llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll | 33
-rw-r--r--  llvm/test/CodeGen/NVPTX/lower-args.ll | 10
-rw-r--r--  llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 2
-rw-r--r--  llvm/test/CodeGen/NVPTX/misched_func_call.ll | 15
-rw-r--r--  llvm/test/CodeGen/NVPTX/param-add.ll | 8
-rw-r--r--  llvm/test/CodeGen/NVPTX/param-load-store.ll | 280
-rw-r--r--  llvm/test/CodeGen/NVPTX/param-overalign.ll | 4
-rw-r--r--  llvm/test/CodeGen/NVPTX/param-vectorize-device.ll | 28
-rw-r--r--  llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir | 4
-rw-r--r--  llvm/test/CodeGen/NVPTX/st-param-imm.ll | 201
-rw-r--r--  llvm/test/CodeGen/NVPTX/store-undef.ll | 4
-rw-r--r--  llvm/test/CodeGen/NVPTX/tex-read-cuda.ll | 2
-rw-r--r--  llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll | 594
-rw-r--r--  llvm/test/CodeGen/NVPTX/vaargs.ll | 16
-rw-r--r--  llvm/test/CodeGen/NVPTX/variadics-backend.ll | 32
-rw-r--r--  llvm/test/CodeGen/PowerPC/froundeven-legalization.ll | 111
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll | 6
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/fixed-vectors-segN-load.ll | 77
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll | 22
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll | 58
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vlseg-rv32.ll | 2923
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vlseg-rv64.ll | 2979
-rw-r--r--  llvm/test/CodeGen/WebAssembly/libcall_vectorized.ll | 33
-rw-r--r--  llvm/test/CodeGen/WebAssembly/lower-em-sjlj-alloca.ll | 129
-rw-r--r--  llvm/test/CodeGen/WebAssembly/lower-em-sjlj-debuginfo.ll | 4
-rw-r--r--  llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll | 6
-rw-r--r--  llvm/test/CodeGen/WebAssembly/lower-wasm-ehsjlj.ll | 2
-rw-r--r--  llvm/test/CodeGen/WebAssembly/lower-wasm-sjlj.ll | 6
-rw-r--r--  llvm/test/CodeGen/WebAssembly/ref-test-func.ll | 4
-rw-r--r--  llvm/test/CodeGen/WebAssembly/returned.ll | 24
-rw-r--r--  llvm/test/CodeGen/WebAssembly/simd-arith.ll | 136
-rw-r--r--  llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll | 145
-rw-r--r--  llvm/test/CodeGen/WebAssembly/target-features-cpus.ll | 7
-rw-r--r--  llvm/test/CodeGen/WebAssembly/vector-reduce.ll | 56
-rw-r--r--  llvm/test/CodeGen/X86/embed-bitcode.ll | 13
-rw-r--r--  llvm/test/CodeGen/X86/isel-fpclass.ll | 526
-rw-r--r--  llvm/test/CodeGen/X86/late-tail-dup-computed-goto.mir | 128
-rw-r--r--  llvm/test/CodeGen/X86/swap.ll | 15
-rw-r--r--  llvm/test/CodeGen/XCore/section-name.ll | 4
-rw-r--r--  llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll | 8
-rw-r--r--  llvm/test/DebugInfo/X86/branch-folder-dbg-after-end.mir | 108
-rw-r--r--  llvm/test/DebugInfo/X86/branch-folder-dbg.mir | 46
-rw-r--r--  llvm/test/DebugInfo/X86/stringpool.ll | 4
-rw-r--r--  llvm/test/Instrumentation/HWAddressSanitizer/RISCV/exception-lifetime.ll | 14
-rw-r--r--  llvm/test/Instrumentation/HWAddressSanitizer/exception-lifetime.ll | 18
-rw-r--r--  llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s | 244
-rw-r--r--  llvm/test/MC/AMDGPU/gfx1250_asm_vflat_err.s | 48
-rw-r--r--  llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt | 156
-rw-r--r--  llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-errors.test | 8
-rw-r--r--  llvm/test/MC/Disassembler/RISCV/c_slli.txt | 64
-rw-r--r--  llvm/test/MC/ELF/many-instructions.s | 10
-rw-r--r--  llvm/test/MC/ELF/section-sym2.s | 27
-rw-r--r--  llvm/test/MC/ELF/undefined-debug.s | 5
-rw-r--r--  llvm/test/MC/RISCV/Relocations/mc-dump.s | 4
-rw-r--r--  llvm/test/MC/RISCV/align.s | 38
-rw-r--r--  llvm/test/MC/RISCV/rv32c-invalid.s | 22
-rw-r--r--  llvm/test/MC/RISCV/rv64c-invalid.s | 7
-rw-r--r--  llvm/test/MC/RISCV/rvc-hints-invalid.s | 9
-rw-r--r--  llvm/test/MC/RISCV/rvc-hints-valid.s | 24
-rw-r--r--  llvm/test/MC/RISCV/rve-invalid.s | 2
-rw-r--r--  llvm/test/MC/WebAssembly/reference-types.s | 4
-rw-r--r--  llvm/test/TableGen/CompressInstEmitter/suboperands.td | 37
-rw-r--r--  llvm/test/TableGen/RuntimeLibcallEmitter.td | 6
-rw-r--r--  llvm/test/TableGen/getsetop.td | 14
-rw-r--r--  llvm/test/TableGen/unsetop.td | 6
-rw-r--r--  llvm/test/ThinLTO/X86/memprof-basic.ll | 5
-rw-r--r--  llvm/test/ThinLTO/X86/memprof_func_assign_fix.ll | 145
-rw-r--r--  llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll | 34
-rw-r--r--  llvm/test/Transforms/FunctionAttrs/noalias.ll | 245
-rw-r--r--  llvm/test/Transforms/FunctionAttrs/nofree.ll | 18
-rw-r--r--  llvm/test/Transforms/FunctionAttrs/nonnull.ll | 30
-rw-r--r--  llvm/test/Transforms/FunctionAttrs/norecurse.ll | 33
-rw-r--r--  llvm/test/Transforms/FunctionAttrs/nounwind.ll | 145
-rw-r--r--  llvm/test/Transforms/FunctionAttrs/sendmsg-nocallback.ll | 10
-rw-r--r--  llvm/test/Transforms/HipStdPar/math-fixup.ll | 548
-rw-r--r--  llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs-low-threshold.ll | 10
-rw-r--r--  llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll | 172
-rw-r--r--  llvm/test/Transforms/InstCombine/recurrence-binary-intrinsic.ll | 394
-rw-r--r--  llvm/test/Transforms/InstCombine/trunc-inseltpoison.ll | 4
-rw-r--r--  llvm/test/Transforms/InstCombine/trunc.ll | 17
-rw-r--r--  llvm/test/Transforms/InstSimplify/ConstProp/gep.ll | 28
-rw-r--r--  llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll | 76
-rw-r--r--  llvm/test/Transforms/LoopFusion/sunk-phi-nodes.ll | 65
-rw-r--r--  llvm/test/Transforms/LoopIdiom/reuse-lcssa-phi-scev-expansion.ll | 163
-rw-r--r--  llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll | 141
-rw-r--r--  llvm/test/Transforms/LoopLoadElim/invalidate-laa-after-versioning.ll | 9
-rw-r--r--  llvm/test/Transforms/LoopLoadElim/versioning-scev-invalidation.ll | 5
-rw-r--r--  llvm/test/Transforms/LoopUnroll/Hexagon/reuse-lcssa-phi-scev-expansion.ll | 108
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll | 10
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll | 20
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll | 46
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll | 26
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll | 9
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll | 16
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll | 7
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll | 54
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll | 87
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll | 12
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/fminimumnum.ll | 78
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll | 35
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll | 12
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll | 19
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll | 11
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll | 12
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll | 24
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll | 76
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/licm-calls.ll | 7
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll | 326
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll | 34
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll | 260
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll | 234
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll | 8
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll | 56
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll | 198
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll | 279
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll | 14
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll | 18
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll | 120
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll | 9
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll | 12
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll | 1076
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll | 39
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll | 78
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll | 18
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/streaming-compatible-sve-no-maximize-bandwidth.ll | 18
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll | 517
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll | 18
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll | 12
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll | 20
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll | 8
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll | 24
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll | 9
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll | 46
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-vscale-based-trip-counts.ll | 41
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll | 45
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll | 30
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll | 15
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll | 16
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll | 9
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-derived-ivs.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-metadata.ll | 11
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll | 12
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll | 11
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll | 145
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll | 42
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll | 76
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll | 30
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ARM/mve-hoist-runtime-checks.ll | 16
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ARM/mve-multiexit.ll | 12
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll | 14
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll | 45
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll | 251
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll | 48
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll | 9
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll | 99
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll | 9
-rw-r--r--  llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll | 48
-rw-r--r--  llvm/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll | 102
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll | 10
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll | 8
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll | 126
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll | 7
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/f16.ll | 27
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll | 108
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll | 28
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll | 62
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll | 20
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll | 66
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/only-compute-cost-for-vplan-vfs.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/ordered-reduction.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll | 112
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/preserve-dbg-loc.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll | 18
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll | 28
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll | 22
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll | 101
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll | 42
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll | 24
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll | 21
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-bin-unary-ops-args.ll (renamed from llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll) | 878
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-call-intrinsics.ll (renamed from llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll) | 575
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll (renamed from llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll) | 647
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll (renamed from llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll) | 60
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cost.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-div.ll (renamed from llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-div.ll) | 185
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll (renamed from llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll) | 116
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll (renamed from llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll) | 56
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-inloop-reduction.ll (renamed from llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll) | 246
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll (renamed from llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll) | 28
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-intermediate-store.ll (renamed from llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll) | 24
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-iv32.ll (renamed from llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll) | 18
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-known-no-overflow.ll (renamed from llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-known-no-overflow.ll) | 18
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-masked-loadstore.ll (renamed from llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll) | 59
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-no-masking.ll (renamed from llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-no-masking.ll) | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-ordered-reduction.ll (renamed from llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll) | 52
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reduction-cost.ll (renamed from llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction-cost.ll) | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reduction.ll (renamed from llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll) | 240
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll (renamed from llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll) | 164
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-safe-dep-distance.ll (renamed from llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll) | 185
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll (renamed from llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll) | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll | 12
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll | 11
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll | 9
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll | 196
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll | 24
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll | 12
-rw-r--r--  llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll | 7
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll | 7
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll | 9
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/cost-constant-known-via-scev.ll | 12
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/cost-model.ll | 20
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll | 78
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll | 12
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll | 16
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/fminimumnum.ll | 78
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll | 44
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll | 23
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll | 18
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll | 60
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/induction-step.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll | 12
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll | 7
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/interleaving.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/iv-live-outs.ll | 5
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll | 49
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll | 135
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll | 16
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll | 373
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll | 20
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/optsize.ll | 34
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll | 22
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/pr131359-dead-for-splice.ll | 8
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/pr34438.ll | 13
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/pr35432.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/pr36524.ll | 9
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/pr47437.ll | 12
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll | 5
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/pr56319-vector-exit-cond-optimization-epilogue-vectorization.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll | 99
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll | 33
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll | 8
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/small-size.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll | 20
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll | 24
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll | 23
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll | 58
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll | 36
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll | 27
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll | 20
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll | 18
-rw-r--r--  llvm/test/Transforms/LoopVectorize/blend-in-header.ll | 28
-rw-r--r--  llvm/test/Transforms/LoopVectorize/bsd_regex.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/check-prof-info.ll | 26
-rw-r--r--  llvm/test/Transforms/LoopVectorize/constantfolder-infer-correct-gepty.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/constantfolder.ll | 28
-rw-r--r--  llvm/test/Transforms/LoopVectorize/create-induction-resume.ll | 11
-rw-r--r--  llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/dead_instructions.ll | 22
-rw-r--r--  llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll | 26
-rw-r--r--  llvm/test/Transforms/LoopVectorize/debugloc.ll | 10
-rw-r--r--  llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll | 175
-rw-r--r--  llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-variable-size.ll | 45
-rw-r--r--  llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll | 7
-rw-r--r--  llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll | 42
-rw-r--r--  llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll | 12
-rw-r--r--  llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll | 30
-rw-r--r--  llvm/test/Transforms/LoopVectorize/epilog-vectorization-trunc-induction-steps.ll | 10
-rw-r--r--  llvm/test/Transforms/LoopVectorize/expand-scev-after-invoke.ll | 7
-rw-r--r--  llvm/test/Transforms/LoopVectorize/extract-from-end-vector-constant.ll | 18
-rw-r--r--  llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll | 444
-rw-r--r--  llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll | 101
-rw-r--r--  llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll | 61
-rw-r--r--  llvm/test/Transforms/LoopVectorize/first-order-recurrence-interleave-only.ll | 10
-rw-r--r--  llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll | 25
-rw-r--r--  llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll | 9
-rw-r--r--  llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll | 424
-rw-r--r--  llvm/test/Transforms/LoopVectorize/float-induction.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll | 9
-rw-r--r--  llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll | 12
-rw-r--r--  llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/forked-pointers.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/fpsat.ll | 12
-rw-r--r--  llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll | 63
-rw-r--r--  llvm/test/Transforms/LoopVectorize/if-pred-stores.ll | 34
-rw-r--r--  llvm/test/Transforms/LoopVectorize/if-reduction.ll | 48
-rw-r--r--  llvm/test/Transforms/LoopVectorize/induction-multiple-uses-in-same-instruction.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/induction-step.ll | 30
-rw-r--r--  llvm/test/Transforms/LoopVectorize/induction.ll | 212
-rw-r--r--  llvm/test/Transforms/LoopVectorize/induction_plus.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll | 25
-rw-r--r--  llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/interleave-with-i65-induction.ll | 9
-rw-r--r--  llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll | 9
-rw-r--r--  llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll | 18
-rw-r--r--  llvm/test/Transforms/LoopVectorize/invalidate-scev-at-scope-after-vectorization.ll | 47
-rw-r--r--  llvm/test/Transforms/LoopVectorize/is_fpclass.ll | 7
-rw-r--r--  llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll | 60
-rw-r--r--  llvm/test/Transforms/LoopVectorize/iv-select-cmp-nested-loop.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll | 12
-rw-r--r--  llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll | 78
-rw-r--r--  llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll | 96
-rw-r--r--  llvm/test/Transforms/LoopVectorize/iv_outside_user.ll | 106
-rw-r--r--  llvm/test/Transforms/LoopVectorize/lcssa-crashes.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll | 45
-rw-r--r--  llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll | 16
-rw-r--r--  llvm/test/Transforms/LoopVectorize/load-of-struct-deref-pred.ll | 191
-rw-r--r--  llvm/test/Transforms/LoopVectorize/loop-form.ll | 33
-rw-r--r--  llvm/test/Transforms/LoopVectorize/make-followup-loop-id.ll | 7
-rw-r--r--  llvm/test/Transforms/LoopVectorize/metadata.ll | 120
-rw-r--r--  llvm/test/Transforms/LoopVectorize/min-trip-count-known-via-scev.ll | 15
-rw-r--r--  llvm/test/Transforms/LoopVectorize/minimumnum-maximumnum-reductions.ll | 36
-rw-r--r--  llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll | 16
-rw-r--r--  llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/no_outside_user.ll | 31
-rw-r--r--  llvm/test/Transforms/LoopVectorize/opaque-ptr.ll | 19
-rw-r--r--  llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll | 12
-rw-r--r--  llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll | 60
-rw-r--r--  llvm/test/Transforms/LoopVectorize/optsize.ll | 63
-rw-r--r--  llvm/test/Transforms/LoopVectorize/phi-cost.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll | 10
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pointer-induction-unroll.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pointer-induction.ll | 27
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll | 9
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr35773.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll | 5
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr44488-predication.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr45259.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll | 8
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr50686.ll | 17
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll | 21
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr66616.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/predicate-switch.ll | 40
-rw-r--r--  llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll | 52
-rw-r--r--  llvm/test/Transforms/LoopVectorize/preserve-or-disjoint.ll | 5
-rw-r--r--  llvm/test/Transforms/LoopVectorize/reduction-inloop-min-max.ll | 12
-rw-r--r--  llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/reduction-inloop.ll | 36
-rw-r--r--  llvm/test/Transforms/LoopVectorize/reduction-odd-interleave-counts.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll | 9
-rw-r--r--  llvm/test/Transforms/LoopVectorize/reduction.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/remarks-reduction-inloop.ll | 9
-rw-r--r--  llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll | 310
-rw-r--r--  llvm/test/Transforms/LoopVectorize/reverse_induction.ll | 39
-rw-r--r--  llvm/test/Transforms/LoopVectorize/runtime-check-known-true.ll | 92
-rw-r--r--  llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll | 23
-rw-r--r--  llvm/test/Transforms/LoopVectorize/runtime-check.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/runtime-checks-difference-simplifications.ll | 49
-rw-r--r--  llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll | 297
-rw-r--r--  llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll | 42
-rw-r--r--  llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/scalable-lifetime.ll | 20
-rw-r--r--  llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll | 11
-rw-r--r--  llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll | 32
-rw-r--r--  llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll | 24
-rw-r--r--  llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/select-cmp.ll | 30
-rw-r--r--  llvm/test/Transforms/LoopVectorize/select-neg-cond.ll | 9
-rw-r--r--  llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll | 75
-rw-r--r--  llvm/test/Transforms/LoopVectorize/select-with-fastflags.ll | 9
-rw-r--r--  llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-hint.ll | 30
-rw-r--r--  llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll | 467
-rw-r--r--  llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll | 41
-rw-r--r--  llvm/test/Transforms/LoopVectorize/single_early_exit.ll | 30
-rw-r--r--  llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll | 254
-rw-r--r--  llvm/test/Transforms/LoopVectorize/single_early_exit_with_outer_loop.ll | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/strided-accesses-interleave-only.ll | 4
-rw-r--r--  llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll | 54
-rw-r--r--  llvm/test/Transforms/LoopVectorize/tail-folding-optimize-vector-induction-width.ll | 16
-rw-r--r--  llvm/test/Transforms/LoopVectorize/trip-count-expansion-may-introduce-ub.ll | 45
-rw-r--r--  llvm/test/Transforms/LoopVectorize/trunc-extended-icmps.ll | 27
-rw-r--r--  llvm/test/Transforms/LoopVectorize/trunc-loads-p16.ll | 10
-rw-r--r--  llvm/test/Transforms/LoopVectorize/trunc-reductions.ll | 13
-rw-r--r--  llvm/test/Transforms/LoopVectorize/trunc-shifts.ll | 48
-rw-r--r--  llvm/test/Transforms/LoopVectorize/uitofp-preserve-nneg.ll | 7
-rw-r--r--  llvm/test/Transforms/LoopVectorize/uniform-blend.ll | 18
-rw-r--r--  llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll | 170
-rw-r--r--  llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll | 48
-rw-r--r--  llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll | 36
-rw-r--r--  llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll | 206
-rw-r--r--  llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll | 156
-rw-r--r--  llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll | 25
-rw-r--r--  llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll | 12
-rw-r--r--  llvm/test/Transforms/LoopVectorize/vector-intrinsic-call-cost.ll | 6
-rw-r--r--  llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-branch-weights.ll | 25
-rw-r--r--llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll243
-rw-r--r--llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-outside-iv-users.ll32
-rw-r--r--llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll172
-rw-r--r--llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll18
-rw-r--r--llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll47
-rw-r--r--llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll6
-rw-r--r--llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll3
-rw-r--r--llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll6
-rw-r--r--llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll33
-rw-r--r--llvm/test/Transforms/LoopVectorize/widen-intrinsic.ll9
-rw-r--r--llvm/test/Transforms/LoopVersioning/invalidate-laa-after-versioning.ll13
-rw-r--r--llvm/test/Transforms/MemCpyOpt/stack-move.ll15
-rw-r--r--llvm/test/Transforms/MemProfContextDisambiguation/func_assign_fix.ll130
-rw-r--r--llvm/test/Transforms/ObjCARC/apelim.ll2
-rw-r--r--llvm/test/Transforms/ObjCARC/comdat-ipo.ll2
-rw-r--r--llvm/test/Transforms/PGOProfile/prof-verify.ll1
-rw-r--r--llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll50
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll6
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll14
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll15
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll3
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll16
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll11
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll2
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll11
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/user-node-no-state.ll48
-rw-r--r--llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll6
-rw-r--r--llvm/test/Transforms/StructurizeCFG/hoist-zerocost.ll161
-rw-r--r--llvm/test/lit.site.cfg.py.in1
-rw-r--r--llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected12
-rw-r--r--llvm/test/tools/llvm-readtapi/many-targets.test20
-rw-r--r--llvm/tools/dsymutil/MachOUtils.cpp2
-rw-r--r--llvm/tools/opt/NewPMDriver.cpp8
-rw-r--r--llvm/tools/opt/NewPMDriver.h2
-rw-r--r--llvm/tools/opt/optdriver.cpp12
-rw-r--r--llvm/tools/spirv-tools/CMakeLists.txt4
-rw-r--r--llvm/unittests/Analysis/IR2VecTest.cpp6
-rw-r--r--llvm/unittests/CodeGen/DroppedVariableStatsMIRTest.cpp3
-rw-r--r--llvm/unittests/Support/DebugLogTest.cpp82
-rw-r--r--llvm/unittests/TargetParser/TripleTest.cpp6
-rw-r--r--llvm/unittests/Transforms/Vectorize/VPlanTest.cpp2
-rw-r--r--llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp2
-rw-r--r--llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp9
-rw-r--r--llvm/utils/TableGen/CompressInstEmitter.cpp233
-rw-r--r--llvm/utils/TableGen/InstrInfoEmitter.cpp13
-rwxr-xr-xllvm/utils/gn/build/write_vcsrevision.py17
-rw-r--r--llvm/utils/gn/secondary/clang-tools-extra/clangd/refactor/tweaks/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/libcxx/include/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/lldb/source/Plugins/Process/Linux/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/lib/MC/BUILD.gn6
-rw-r--r--llvm/utils/gn/secondary/llvm/lib/Transforms/ObjCARC/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/test/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn1
-rw-r--r--llvm/utils/lit/lit/formats/base.py3
-rw-r--r--llvm/utils/lldbDataFormatters.py30
942 files changed, 57200 insertions, 32749 deletions
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 903461b..e874d76 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -1190,6 +1190,8 @@ if (LLVM_HAVE_TFLITE)
find_package(tensorflow-lite REQUIRED)
endif()
+set(LLVM_ENABLE_PROFCHECK OFF CACHE BOOL "Enable profile checking in test tools")
+
# For up-to-date instructions for installing the Tensorflow dependency, refer to
# the bot setup script: https://github.com/google/ml-compiler-opt/blob/main/buildbot/buildbot_init.sh
# Specifically, assuming python3 is installed:
diff --git a/llvm/cmake/modules/LLVMProcessSources.cmake b/llvm/cmake/modules/LLVMProcessSources.cmake
index 0670d60..a7f9517 100644
--- a/llvm/cmake/modules/LLVMProcessSources.cmake
+++ b/llvm/cmake/modules/LLVMProcessSources.cmake
@@ -58,6 +58,15 @@ function(llvm_process_sources OUT_VAR)
set(sources ${ARG_UNPARSED_ARGUMENTS})
llvm_check_source_file_list(${sources})
+ foreach(fn ${sources})
+ get_filename_component(suf ${fn} EXT)
+ if("${suf}" STREQUAL ".cpp" OR "${suf}" STREQUAL ".c")
+ get_filename_component(short_name ${fn} NAME)
+ set_source_files_properties(${fn} PROPERTIES COMPILE_DEFINITIONS "__SHORT_FILE__=\"${short_name}\"")
+ endif()
+ endforeach()
+
+
# This adds .td and .h files to the Visual Studio solution:
add_td_sources(sources)
find_all_header_files(hdrs "${ARG_ADDITIONAL_HEADER_DIRS}")
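
For illustration, here is a minimal sketch of how a per-file ``__SHORT_FILE__``
definition like the one added above can be consumed from C++ (the ``MY_LOG``
macro is a hypothetical example, not part of this change):

.. code-block:: c++

   #include <cstdio>

   // Fall back to the full path when the build system did not define it.
   #ifndef __SHORT_FILE__
   #define __SHORT_FILE__ __FILE__
   #endif

   // Prints e.g. "MyPass.cpp:42: message" rather than a full source path.
   #define MY_LOG(msg)                                                        \
     std::fprintf(stderr, "%s:%d: %s\n", __SHORT_FILE__, __LINE__, msg)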
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index d13f95b..c3d4833 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -768,6 +768,9 @@ For example:
performant than code generated for XNACK replay
disabled.
+ cu-stores TODO On GFX12.5, controls whether ``scope:SCOPE_CU`` stores may be used.
+ If disabled, all stores will be done at ``scope:SCOPE_SE`` or greater.
+
=============== ============================ ==================================================
.. _amdgpu-target-id:
@@ -5107,7 +5110,9 @@ The fields used by CP for code objects before V3 also match those specified in
and must be 0,
>454 1 bit ENABLE_SGPR_PRIVATE_SEGMENT
_SIZE
- 457:455 3 bits Reserved, must be 0.
+ 455 1 bit USES_CU_STORES GFX12.5: Whether the ``cu-stores`` target attribute is enabled.
+ If 0, then all stores are ``SCOPE_SE`` or higher.
+ 457:456 2 bits Reserved, must be 0.
458 1 bit ENABLE_WAVEFRONT_SIZE32 GFX6-GFX9
Reserved, must be 0.
GFX10-GFX11
@@ -18188,6 +18193,8 @@ terminated by an ``.end_amdhsa_kernel`` directive.
GFX942)
``.amdhsa_user_sgpr_private_segment_size`` 0 GFX6-GFX12 Controls ENABLE_SGPR_PRIVATE_SEGMENT_SIZE in
:ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`.
+ ``.amdhsa_uses_cu_stores`` 0 GFX12.5 Controls USES_CU_STORES in
+ :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`.
``.amdhsa_wavefront_size32`` Target GFX10-GFX12 Controls ENABLE_WAVEFRONT_SIZE32 in
Feature :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`.
Specific
diff --git a/llvm/docs/CodingStandards.rst b/llvm/docs/CodingStandards.rst
index 732227b..2dc3d77 100644
--- a/llvm/docs/CodingStandards.rst
+++ b/llvm/docs/CodingStandards.rst
@@ -1594,20 +1594,25 @@ Restrict Visibility
^^^^^^^^^^^^^^^^^^^
Functions and variables should have the most restricted visibility possible.
+
For class members, that means using appropriate ``private``, ``protected``, or
-``public`` keyword to restrict their access. For non-member functions, variables,
-and classes, that means restricting visibility to a single ``.cpp`` file if it's
-not referenced outside that file.
+``public`` keyword to restrict their access.
+
+For non-member functions, variables, and classes, that means restricting
+visibility to a single ``.cpp`` file if it is not referenced outside that file.
Visibility of file-scope non-member variables and functions can be restricted to
the current translation unit by using either the ``static`` keyword or an anonymous
-namespace. Anonymous namespaces are a great language feature that tells the C++
+namespace.
+
+Anonymous namespaces are a great language feature that tells the C++
compiler that the contents of the namespace are only visible within the current
translation unit, allowing more aggressive optimization and eliminating the
-possibility of symbol name collisions. Anonymous namespaces are to C++ as
-``static`` is to C functions and global variables. While ``static`` is available
-in C++, anonymous namespaces are more general: they can make entire classes
-private to a file.
+possibility of symbol name collisions.
+
+Anonymous namespaces are to C++ as ``static`` is to C functions and global
+variables. While ``static`` is available in C++, anonymous namespaces are more
+general: they can make entire classes private to a file.
The problem with anonymous namespaces is that they naturally want to encourage
indentation of their body, and they reduce locality of reference: if you see a
@@ -1653,10 +1658,17 @@ Avoid putting declarations other than classes into anonymous namespaces:
} // namespace
-When you are looking at "``runHelper``" in the middle of a large C++ file,
-you have no immediate way to tell if this function is local to the file. In
-contrast, when the function is marked static, you don't need to cross-reference
-faraway places in the file to tell that the function is local.
+When you are looking at ``runHelper`` in the middle of a large C++ file,
+you have no immediate way to tell if this function is local to the file.
+
+In contrast, when the function is marked static, you don't need to cross-reference
+faraway places in the file to tell that the function is local:
+
+.. code-block:: c++
+
+ static void runHelper() {
+ ...
+ }
Don't Use Braces on Simple Single-Statement Bodies of if/else/loop Statements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/docs/GettingStarted.rst b/llvm/docs/GettingStarted.rst
index 3036dae..e4dbb64b 100644
--- a/llvm/docs/GettingStarted.rst
+++ b/llvm/docs/GettingStarted.rst
@@ -240,8 +240,10 @@ Linux x86\ :sup:`1` GCC, Clang
Linux amd64 GCC, Clang
Linux ARM GCC, Clang
Linux AArch64 GCC, Clang
+Linux LoongArch GCC, Clang
Linux Mips GCC, Clang
Linux PowerPC GCC, Clang
+Linux RISC-V GCC, Clang
Linux SystemZ GCC, Clang
Solaris V9 (Ultrasparc) GCC
DragonFlyBSD amd64 GCC, Clang
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index bac13cc..eb2ef6b 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -26653,9 +26653,10 @@ object's lifetime.
Arguments:
""""""""""
-The first argument is a constant integer representing the size of the
-object, or -1 if it is variable sized. The second argument is a pointer
-to an ``alloca`` instruction.
+The first argument is a constant integer, which is ignored and will be removed
+in the future.
+
+The second argument is a pointer to an ``alloca`` instruction.
Semantics:
""""""""""
@@ -26693,9 +26694,10 @@ The '``llvm.lifetime.end``' intrinsic specifies the end of a
Arguments:
""""""""""
-The first argument is a constant integer representing the size of the
-object, or -1 if it is variable sized. The second argument is a pointer
-to an ``alloca`` instruction.
+The first argument is a constant integer, which is ignored and will be removed
+in the future.
+
+The second argument is a pointer to an ``alloca`` instruction.
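
For frontend authors, a minimal sketch of emitting these markers through
``IRBuilder`` (assuming the form above, where the size operand is ignored and
can simply be omitted):

.. code-block:: c++

   // Alloca is an AllocaInst *, Builder an IRBuilder<> positioned inside the
   // enclosing function; both are assumed to exist in the surrounding code.
   Builder.CreateLifetimeStart(Alloca);
   // ... code that uses the alloca ...
   Builder.CreateLifetimeEnd(Alloca);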
Semantics:
""""""""""
diff --git a/llvm/docs/ProgrammersManual.rst b/llvm/docs/ProgrammersManual.rst
index 68490c8..9ddeebd 100644
--- a/llvm/docs/ProgrammersManual.rst
+++ b/llvm/docs/ProgrammersManual.rst
@@ -932,7 +932,7 @@ In some contexts, certain types of errors are known to be benign. For example,
when walking an archive, some clients may be happy to skip over badly formatted
object files rather than terminating the walk immediately. Skipping badly
formatted objects could be achieved using an elaborate handler method, but the
-Error.h header provides two utilities that make this idiom much cleaner: the
+``Error.h`` header provides two utilities that make this idiom much cleaner: the
type inspection method, ``isA``, and the ``consumeError`` function:
.. code-block:: c++
@@ -1073,7 +1073,7 @@ relatively natural use of C++ iterator/loop idioms.
.. _function_apis:
More information on Error and its related utilities can be found in the
-Error.h header file.
+``Error.h`` header file.
Passing functions and other callable objects
--------------------------------------------
@@ -1224,7 +1224,7 @@ Then you can run your pass like this:
Of course, in practice, you should only set ``DEBUG_TYPE`` at the top of a file,
to specify the debug type for the entire module. Be careful that you only do
-this after including Debug.h and not around any #include of headers. Also, you
+this after including ``Debug.h`` and not around any ``#include`` of headers. Also, you
should use names more meaningful than "foo" and "bar", because there is no
system in place to ensure that names do not conflict. If two different modules
use the same string, they will all be turned on when the name is specified.
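
A minimal sketch of the idiom (``my-pass`` is a hypothetical debug type):

.. code-block:: c++

   #include "llvm/Support/Debug.h"
   using namespace llvm;

   #define DEBUG_TYPE "my-pass" // one unique name for this whole file

   void doSomething() {
     // Printed only under -debug or -debug-only=my-pass.
     LLVM_DEBUG(dbgs() << "about to transform the function\n");
   }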
@@ -1579,18 +1579,18 @@ llvm/ADT/SmallVector.h
``SmallVector<Type, N>`` is a simple class that looks and smells just like
``vector<Type>``: it supports efficient iteration, lays out elements in memory
order (so you can do pointer arithmetic between elements), supports efficient
-push_back/pop_back operations, supports efficient random access to its elements,
+``push_back``/``pop_back`` operations, supports efficient random access to its elements,
etc.
-The main advantage of SmallVector is that it allocates space for some number of
-elements (N) **in the object itself**. Because of this, if the SmallVector is
+The main advantage of ``SmallVector`` is that it allocates space for some number of
+elements (N) **in the object itself**. Because of this, if the ``SmallVector`` is
dynamically smaller than N, no malloc is performed. This can be a big win in
cases where the malloc/free call is far more expensive than the code that
fiddles around with the elements.
This is good for vectors that are "usually small" (e.g. the number of
predecessors/successors of a block is usually less than 8). On the other hand,
-this makes the size of the SmallVector itself large, so you don't want to
+this makes the size of the ``SmallVector`` itself large, so you don't want to
allocate lots of them (doing so will waste a lot of space). As such,
SmallVectors are most useful when on the stack.
@@ -1600,21 +1600,21 @@ omitting the ``N``). This will choose a default number of
inlined elements reasonable for allocation on the stack (for example, trying
to keep ``sizeof(SmallVector<T>)`` around 64 bytes).
-SmallVector also provides a nice portable and efficient replacement for
+``SmallVector`` also provides a nice portable and efficient replacement for
``alloca``.
-SmallVector has grown a few other minor advantages over std::vector, causing
+``SmallVector`` has grown a few other minor advantages over ``std::vector``, causing
``SmallVector<Type, 0>`` to be preferred over ``std::vector<Type>``.
-#. std::vector is exception-safe, and some implementations have pessimizations
- that copy elements when SmallVector would move them.
+#. ``std::vector`` is exception-safe, and some implementations have pessimizations
+ that copy elements when ``SmallVector`` would move them.
-#. SmallVector understands ``std::is_trivially_copyable<Type>`` and uses realloc aggressively.
+#. ``SmallVector`` understands ``std::is_trivially_copyable<Type>`` and uses realloc aggressively.
-#. Many LLVM APIs take a SmallVectorImpl as an out parameter (see the note
+#. Many LLVM APIs take a ``SmallVectorImpl`` as an out parameter (see the note
below).
-#. SmallVector with N equal to 0 is smaller than std::vector on 64-bit
+#. ``SmallVector`` with N equal to 0 is smaller than ``std::vector`` on 64-bit
platforms, since it uses ``unsigned`` (instead of ``void*``) for its size
and capacity.
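
As a minimal sketch of the intended stack usage:

.. code-block:: c++

   #include "llvm/ADT/SmallVector.h"

   void example() {
     // Space for 8 elements lives inside the object itself; no heap
     // allocation happens until a 9th element is pushed.
     llvm::SmallVector<unsigned, 8> Preds;
     for (unsigned I = 0; I != 4; ++I)
       Preds.push_back(I); // stays entirely on the stack
   }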
@@ -1698,11 +1698,11 @@ non-ordered manner.
^^^^^^^^
``std::vector<T>`` is well loved and respected. However, ``SmallVector<T, 0>``
-is often a better option due to the advantages listed above. std::vector is
+is often a better option due to the advantages listed above. ``std::vector`` is
still useful when you need to store more than ``UINT32_MAX`` elements or when
interfacing with code that expects vectors :).
-One worthwhile note about std::vector: avoid code like this:
+One worthwhile note about ``std::vector``: avoid code like this:
.. code-block:: c++
@@ -1749,10 +1749,10 @@ extremely high constant factor, particularly for small data types.
``std::list`` also only supports bidirectional iteration, not random access
iteration.
-In exchange for this high cost, std::list supports efficient access to both ends
+In exchange for this high cost, ``std::list`` supports efficient access to both ends
of the list (like ``std::deque``, but unlike ``std::vector`` or
``SmallVector``). In addition, the iterator invalidation characteristics of
-std::list are stronger than that of a vector class: inserting or removing an
+``std::list`` are stronger than those of a vector class: inserting or removing an
element does not invalidate iterators or pointers to other elements
in the list.
@@ -1895,7 +1895,7 @@ Note that it is generally preferred to *not* pass strings around as ``const
char*``'s. These have a number of problems, including the fact that they
cannot represent embedded nul ("\0") characters, and do not have a length
available efficiently. The general replacement for '``const char*``' is
-StringRef.
+``StringRef``.
For more information on choosing string containers for APIs, please see
:ref:`Passing Strings <string_apis>`.
@@ -1905,41 +1905,41 @@ For more information on choosing string containers for APIs, please see
llvm/ADT/StringRef.h
^^^^^^^^^^^^^^^^^^^^
-The StringRef class is a simple value class that contains a pointer to a
+The ``StringRef`` class is a simple value class that contains a pointer to a
character and a length, and is quite related to the :ref:`ArrayRef
<dss_arrayref>` class (but specialized for arrays of characters). Because
-StringRef carries a length with it, it safely handles strings with embedded nul
+``StringRef`` carries a length with it, it safely handles strings with embedded nul
characters in it, getting the length does not require a ``strlen`` call, and it even
has very convenient APIs for slicing and dicing the character range that it
represents.
-StringRef is ideal for passing simple strings around that are known to be live,
-either because they are C string literals, std::string, a C array, or a
-SmallVector. Each of these cases has an efficient implicit conversion to
-StringRef, which doesn't result in a dynamic strlen being executed.
+``StringRef`` is ideal for passing simple strings around that are known to be live,
+either because they are C string literals, ``std::string``, a C array, or a
+``SmallVector``. Each of these cases has an efficient implicit conversion to
+``StringRef``, which doesn't result in a dynamic ``strlen`` being executed.
-StringRef has a few major limitations which make more powerful string containers
+``StringRef`` has a few major limitations which make more powerful string containers
useful:
-#. You cannot directly convert a StringRef to a 'const char*' because there is
- no way to add a trailing nul (unlike the .c_str() method on various stronger
+#. You cannot directly convert a ``StringRef`` to a ``const char*`` because there is
+ no way to add a trailing nul (unlike the ``.c_str()`` method on various stronger
classes).
-#. StringRef doesn't own or keep alive the underlying string bytes.
+#. ``StringRef`` doesn't own or keep alive the underlying string bytes.
As such it can easily lead to dangling pointers, and is not suitable for
- embedding in datastructures in most cases (instead, use an std::string or
+ embedding in datastructures in most cases (instead, use an ``std::string`` or
something like that).
-#. For the same reason, StringRef cannot be used as the return value of a
- method if the method "computes" the result string. Instead, use std::string.
+#. For the same reason, ``StringRef`` cannot be used as the return value of a
+ method if the method "computes" the result string. Instead, use ``std::string``.
-#. StringRef's do not allow you to mutate the pointed-to string bytes and it
+#. A ``StringRef`` does not allow you to mutate the pointed-to string bytes and it
doesn't allow you to insert or remove bytes from the range. For editing
operations like this, it interoperates with the :ref:`Twine <dss_twine>`
class.
Because of its strengths and limitations, it is very common for a function to
-take a StringRef and for a method on an object to return a StringRef that points
+take a ``StringRef`` and for a method on an object to return a ``StringRef`` that points
into some string that it owns.
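
A minimal sketch of that pattern (``starts_with`` is the current spelling of
the prefix query):

.. code-block:: c++

   #include "llvm/ADT/StringRef.h"

   // String literals, std::string, and SmallString all convert implicitly;
   // no copy and no strlen happens at the call site.
   static bool isIntrinsicName(llvm::StringRef Name) {
     return Name.starts_with("llvm.");
   }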
.. _dss_twine:
@@ -1979,25 +1979,25 @@ behavior and will probably crash:
const Twine &Tmp = X + "." + Twine(i);
foo(Tmp);
-... because the temporaries are destroyed before the call. That said, Twine's
-are much more efficient than intermediate std::string temporaries, and they work
-really well with StringRef. Just be aware of their limitations.
+... because the temporaries are destroyed before the call. That said, ``Twine``'s
+are much more efficient than intermediate ``std::string`` temporaries, and they work
+really well with ``StringRef``. Just be aware of their limitations.
.. _dss_smallstring:
llvm/ADT/SmallString.h
^^^^^^^^^^^^^^^^^^^^^^
-SmallString is a subclass of :ref:`SmallVector <dss_smallvector>` that adds some
-convenience APIs like += that takes StringRef's. SmallString avoids allocating
+``SmallString`` is a subclass of :ref:`SmallVector <dss_smallvector>` that adds some
+convenience APIs like ``+=`` that take ``StringRef``'s. ``SmallString`` avoids allocating
memory in the case when the preallocated space is enough to hold its data, and
it calls back to general heap allocation when required. Since it owns its data,
it is very safe to use and supports full mutation of the string.
-Like SmallVector's, the big downside to SmallString is their sizeof. While they
+Like ``SmallVector``'s, the big downside to ``SmallString``'s is their ``sizeof``. While they
are optimized for small strings, they themselves are not particularly small.
This means that they work great for temporary scratch buffers on the stack, but
-should not generally be put into the heap: it is very rare to see a SmallString
+should not generally be put into the heap: it is very rare to see a ``SmallString``
as the member of a frequently-allocated heap data structure or returned
by-value.
@@ -2006,18 +2006,18 @@ by-value.
std::string
^^^^^^^^^^^
-The standard C++ std::string class is a very general class that (like
-SmallString) owns its underlying data. sizeof(std::string) is very reasonable
+The standard C++ ``std::string`` class is a very general class that (like
+``SmallString``) owns its underlying data. ``sizeof(std::string)`` is very reasonable
so it can be embedded into heap data structures and returned by-value. On the
-other hand, std::string is highly inefficient for inline editing (e.g.
+other hand, ``std::string`` is highly inefficient for inline editing (e.g.
concatenating a bunch of stuff together) and because it is provided by the
standard library, its performance characteristics depend a lot on the host
standard library (e.g. libc++ and MSVC provide a highly optimized string class,
GCC contains a really slow implementation).
-The major disadvantage of std::string is that almost every operation that makes
+The major disadvantage of ``std::string`` is that almost every operation that makes
them larger can allocate memory, which is slow. As such, it is better to use
-SmallVector or Twine as a scratch buffer, but then use std::string to persist
+``SmallVector`` or ``Twine`` as a scratch buffer, but then use ``std::string`` to persist
the result.
.. _ds_set:
@@ -2035,8 +2035,8 @@ A sorted 'vector'
^^^^^^^^^^^^^^^^^
If you intend to insert a lot of elements, then do a lot of queries, a great
-approach is to use an std::vector (or other sequential container) with
-std::sort+std::unique to remove duplicates. This approach works really well if
+approach is to use an ``std::vector`` (or other sequential container) with
+``std::sort``+``std::unique`` to remove duplicates. This approach works really well if
your usage pattern has these two distinct phases (insert then query), and can be
coupled with a good choice of :ref:`sequential container <ds_sequential>`.
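
A minimal sketch of the insert-then-query idiom:

.. code-block:: c++

   #include <algorithm>
   #include <vector>

   static bool containsFour(std::vector<int> Vals) {
     // Phase 1: after bulk insertion, sort once and strip duplicates.
     std::sort(Vals.begin(), Vals.end());
     Vals.erase(std::unique(Vals.begin(), Vals.end()), Vals.end());
     // Phase 2: queries are now log(n) binary searches.
     return std::binary_search(Vals.begin(), Vals.end(), 4);
   }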
@@ -2102,11 +2102,11 @@ copy-construction, which :ref:`SmallSet <dss_smallset>` and :ref:`SmallPtrSet
llvm/ADT/DenseSet.h
^^^^^^^^^^^^^^^^^^^
-DenseSet is a simple quadratically probed hash table. It excels at supporting
+``DenseSet`` is a simple quadratically probed hash table. It excels at supporting
small values: it uses a single allocation to hold all of the pairs that are
-currently inserted in the set. DenseSet is a great way to unique small values
+currently inserted in the set. ``DenseSet`` is a great way to unique small values
that are not simple pointers (use :ref:`SmallPtrSet <dss_smallptrset>` for
-pointers). Note that DenseSet has the same requirements for the value type that
+pointers). Note that ``DenseSet`` has the same requirements for the value type that
:ref:`DenseMap <dss_densemap>` has.
.. _dss_sparseset:
@@ -2128,12 +2128,12 @@ data structures.
llvm/ADT/SparseMultiSet.h
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-SparseMultiSet adds multiset behavior to SparseSet, while retaining SparseSet's
-desirable attributes. Like SparseSet, it typically uses a lot of memory, but
+``SparseMultiSet`` adds multiset behavior to ``SparseSet``, while retaining ``SparseSet``'s
+desirable attributes. Like ``SparseSet``, it typically uses a lot of memory, but
provides operations that are almost as fast as a vector. Typical keys are
physical registers, virtual registers, or numbered basic blocks.
-SparseMultiSet is useful for algorithms that need very fast
+``SparseMultiSet`` is useful for algorithms that need very fast
clear/find/insert/erase of the entire collection, and iteration over sets of
elements sharing a key. It is often a more efficient choice than using composite
data structures (e.g. vector-of-vectors, map-of-vectors). It is not intended for
@@ -2144,10 +2144,10 @@ building composite data structures.
llvm/ADT/FoldingSet.h
^^^^^^^^^^^^^^^^^^^^^
-FoldingSet is an aggregate class that is really good at uniquing
+``FoldingSet`` is an aggregate class that is really good at uniquing
expensive-to-create or polymorphic objects. It is a combination of a chained
hash table with intrusive links (uniqued objects are required to inherit from
-FoldingSetNode) that uses :ref:`SmallVector <dss_smallvector>` as part of its ID
+``FoldingSetNode``) that uses :ref:`SmallVector <dss_smallvector>` as part of its ID
process.
Consider a case where you want to implement a "getOrCreateFoo" method for a
@@ -2157,14 +2157,14 @@ operands), but we don't want to 'new' a node, then try inserting it into a set
only to find out it already exists, at which point we would have to delete it
and return the node that already exists.
-To support this style of client, FoldingSet perform a query with a
-FoldingSetNodeID (which wraps SmallVector) that can be used to describe the
+To support this style of client, ``FoldingSet`` performs a query with a
+``FoldingSetNodeID`` (which wraps ``SmallVector``) that can be used to describe the
element that we want to query for. The query either returns the element
matching the ID or it returns an opaque ID that indicates where insertion should
take place. Construction of the ID usually does not require heap traffic.
-Because FoldingSet uses intrusive links, it can support polymorphic objects in
-the set (for example, you can have SDNode instances mixed with LoadSDNodes).
+Because ``FoldingSet`` uses intrusive links, it can support polymorphic objects in
+the set (for example, you can have ``SDNode`` instances mixed with ``LoadSDNodes``).
Because the elements are individually allocated, pointers to the elements are
stable: inserting or removing elements does not invalidate any pointers to other
elements.
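
A minimal sketch of the "getOrCreate" style described above, against a
hypothetical node type:

.. code-block:: c++

   #include "llvm/ADT/FoldingSet.h"
   using namespace llvm;

   struct MyNode : FoldingSetNode {
     unsigned Value;
     MyNode(unsigned V) : Value(V) {}
     void Profile(FoldingSetNodeID &ID) const { ID.AddInteger(Value); }
   };

   MyNode *getOrCreate(FoldingSet<MyNode> &Set, unsigned V) {
     FoldingSetNodeID ID;
     ID.AddInteger(V); // describe the node we are looking for
     void *InsertPos;
     if (MyNode *N = Set.FindNodeOrInsertPos(ID, InsertPos))
       return N;                // already present: no allocation
     MyNode *N = new MyNode(V); // allocate only on a miss
     Set.InsertNode(N, InsertPos);
     return N;
   }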
@@ -2175,7 +2175,7 @@ elements.
^^^^^
``std::set`` is a reasonable all-around set class, which is decent at many
-things but great at nothing. std::set allocates memory for each element
+things but great at nothing. ``std::set`` allocates memory for each element
inserted (thus it is very malloc intensive) and typically stores three pointers
per element in the set (thus adding a large amount of per-element space
overhead). It offers guaranteed log(n) performance, which is not particularly
@@ -2183,12 +2183,12 @@ fast from a complexity standpoint (particularly if the elements of the set are
expensive to compare, like strings), and has extremely high constant factors for
lookup, insertion and removal.
-The advantages of std::set are that its iterators are stable (deleting or
+The advantages of ``std::set`` are that its iterators are stable (deleting or
inserting an element from the set does not affect iterators or pointers to other
elements) and that iteration over the set is guaranteed to be in sorted order.
If the elements in the set are large, then the relative overhead of the pointers
and malloc traffic is not a big deal, but if the elements of the set are small,
-std::set is almost never a good choice.
+``std::set`` is almost never a good choice.
.. _dss_setvector:
@@ -2242,11 +2242,11 @@ produces a lot of malloc traffic. It should be avoided.
llvm/ADT/ImmutableSet.h
^^^^^^^^^^^^^^^^^^^^^^^
-ImmutableSet is an immutable (functional) set implementation based on an AVL
+``ImmutableSet`` is an immutable (functional) set implementation based on an AVL
tree. Adding or removing elements is done through a Factory object and results
-in the creation of a new ImmutableSet object. If an ImmutableSet already exists
+in the creation of a new ``ImmutableSet`` object. If an ``ImmutableSet`` already exists
with the given contents, then the existing one is returned; equality is compared
-with a FoldingSetNodeID. The time and space complexity of add or remove
+with a ``FoldingSetNodeID``. The time and space complexity of add or remove
operations is logarithmic in the size of the original set.
There is no method for returning an element of the set; you can only check for
@@ -2257,11 +2257,11 @@ membership.
Other Set-Like Container Options
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-The STL provides several other options, such as std::multiset and
-std::unordered_set. We never use containers like unordered_set because
+The STL provides several other options, such as ``std::multiset`` and
+``std::unordered_set``. We never use containers like ``unordered_set`` because
they are generally very expensive (each insertion requires a malloc).
-std::multiset is useful if you're not interested in elimination of duplicates,
+``std::multiset`` is useful if you're not interested in elimination of duplicates,
but has all the drawbacks of :ref:`std::set <dss_set>`. A sorted vector
(where you don't delete duplicate entries) or some other approach is almost
always better.
@@ -2282,7 +2282,7 @@ A sorted 'vector'
If your usage pattern follows a strict insert-then-query approach, you can
trivially use the same approach as :ref:`sorted vectors for set-like containers
<dss_sortedvectorset>`. The only difference is that your query function (which
-uses std::lower_bound to get efficient log(n) lookup) should only compare the
+uses ``std::lower_bound`` to get efficient log(n) lookup) should only compare the
key, not both the key and value. This yields the same advantages as sorted
vectors for sets.
@@ -2293,11 +2293,11 @@ llvm/ADT/StringMap.h
Strings are commonly used as keys in maps, and they are difficult to support
efficiently: they are variable length, inefficient to hash and compare when
-long, expensive to copy, etc. StringMap is a specialized container designed to
+long, expensive to copy, etc. ``StringMap`` is a specialized container designed to
cope with these issues. It supports mapping an arbitrary range of bytes to an
arbitrary other object.
-The StringMap implementation uses a quadratically-probed hash table, where the
+The ``StringMap`` implementation uses a quadratically-probed hash table, where the
buckets store a pointer to the heap allocated entries (and some other stuff).
The entries in the map must be heap allocated because the strings are variable
length. The string data (key) and the element object (value) are stored in the
@@ -2305,26 +2305,26 @@ same allocation with the string data immediately after the element object.
This container guarantees the "``(char*)(&Value+1)``" points to the key string
for a value.
-The StringMap is very fast for several reasons: quadratic probing is very cache
+The ``StringMap`` is very fast for several reasons: quadratic probing is very cache
efficient for lookups, the hash value of strings in buckets is not recomputed
-when looking up an element, StringMap rarely has to touch the memory for
+when looking up an element, ``StringMap`` rarely has to touch the memory for
unrelated objects when looking up a value (even when hash collisions happen),
hash table growth does not recompute the hash values for strings already in the
table, and each pair in the map is stored in a single allocation (the string data
is stored in the same allocation as the Value of a pair).
-StringMap also provides query methods that take byte ranges, so it only ever
+``StringMap`` also provides query methods that take byte ranges, so it only ever
copies a string if a value is inserted into the table.
-StringMap iteration order, however, is not guaranteed to be deterministic, so
-any uses which require that should instead use a std::map.
+``StringMap`` iteration order, however, is not guaranteed to be deterministic, so
+any uses which require that should instead use a ``std::map``.
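
A minimal usage sketch:

.. code-block:: c++

   #include "llvm/ADT/StringMap.h"
   #include "llvm/Support/raw_ostream.h"
   using namespace llvm;

   static void countName(StringMap<unsigned> &Counts, StringRef Name) {
     ++Counts[Name];              // key bytes are copied on first insertion
     auto It = Counts.find(Name); // lookups never copy the string
     if (It != Counts.end())
       errs() << It->getKey() << " -> " << It->getValue() << "\n";
   }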
.. _dss_indexmap:
llvm/ADT/IndexedMap.h
^^^^^^^^^^^^^^^^^^^^^
-IndexedMap is a specialized container for mapping small dense integers (or
+``IndexedMap`` is a specialized container for mapping small dense integers (or
values that can be mapped to small dense integers) to some other type. It is
internally implemented as a vector with a mapping function that maps the keys
to the dense integer range.
@@ -2338,27 +2338,27 @@ virtual register ID).
llvm/ADT/DenseMap.h
^^^^^^^^^^^^^^^^^^^
-DenseMap is a simple quadratically probed hash table. It excels at supporting
+``DenseMap`` is a simple quadratically probed hash table. It excels at supporting
small keys and values: it uses a single allocation to hold all of the pairs
-that are currently inserted in the map. DenseMap is a great way to map
+that are currently inserted in the map. ``DenseMap`` is a great way to map
pointers to pointers, or map other small types to each other.
-There are several aspects of DenseMap that you should be aware of, however.
-The iterators in a DenseMap are invalidated whenever an insertion occurs,
-unlike map. Also, because DenseMap allocates space for a large number of
+There are several aspects of ``DenseMap`` that you should be aware of, however.
+The iterators in a ``DenseMap`` are invalidated whenever an insertion occurs,
+unlike ``map``. Also, because ``DenseMap`` allocates space for a large number of
key/value pairs (it starts with 64 by default), it will waste a lot of space if
your keys or values are large. Finally, you must implement a partial
-specialization of DenseMapInfo for the key that you want, if it isn't already
-supported. This is required to tell DenseMap about two special marker values
+specialization of ``DenseMapInfo`` for the key that you want, if it isn't already
+supported. This is required to tell ``DenseMap`` about two special marker values
(which can never be inserted into the map) that it needs internally.
-DenseMap's find_as() method supports lookup operations using an alternate key
+``DenseMap``'s ``find_as()`` method supports lookup operations using an alternate key
type. This is useful in cases where the normal key type is expensive to
-construct, but cheap to compare against. The DenseMapInfo is responsible for
+construct, but cheap to compare against. The ``DenseMapInfo`` is responsible for
defining the appropriate comparison and hashing methods for each alternate key
type used.
-DenseMap.h also contains a SmallDenseMap variant, that similar to
+``DenseMap.h`` also contains a ``SmallDenseMap`` variant that, similar to
:ref:`SmallVector <dss_smallvector>` performs no heap allocation until the
number of elements in the template parameter N are exceeded.
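
A minimal sketch with a pointer key, for which ``DenseMapInfo`` is already
provided:

.. code-block:: c++

   #include "llvm/ADT/DenseMap.h"
   using namespace llvm;

   static unsigned bumpCount(DenseMap<int *, unsigned> &Counts, int *Key) {
     Counts[Key] += 1;           // insertion may invalidate iterators...
     auto It = Counts.find(Key); // ...so take the iterator only afterwards
     return It->second;
   }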
@@ -2404,12 +2404,12 @@ further additions.
<map>
^^^^^
-std::map has similar characteristics to :ref:`std::set <dss_set>`: it uses a
+``std::map`` has similar characteristics to :ref:`std::set <dss_set>`: it uses a
single allocation per pair inserted into the map, it offers log(n) lookup with
an extremely large constant factor, imposes a space penalty of 3 pointers per
pair in the map, etc.
-std::map is most useful when your keys or values are very large, if you need to
+``std::map`` is most useful when your keys or values are very large, if you need to
iterate over the collection in sorted order, or if you need stable iterators
into the map (i.e. they don't get invalidated if an insertion or deletion of
another element takes place).
@@ -2419,7 +2419,7 @@ another element takes place).
llvm/ADT/MapVector.h
^^^^^^^^^^^^^^^^^^^^
-``MapVector<KeyT,ValueT>`` provides a subset of the DenseMap interface. The
+``MapVector<KeyT,ValueT>`` provides a subset of the ``DenseMap`` interface. The
main difference is that the iteration order is guaranteed to be the insertion
order, making it an easy (but somewhat expensive) solution for non-deterministic
iteration over maps of pointers.
@@ -2463,12 +2463,12 @@ operations is logarithmic in the size of the original map.
Other Map-Like Container Options
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-The STL provides several other options, such as std::multimap and
-std::unordered_map. We never use containers like unordered_map because
+The STL provides several other options, such as ``std::multimap`` and
+``std::unordered_map``. We never use containers like ``unordered_map`` because
they are generally very expensive (each insertion requires a malloc).
-std::multimap is useful if you want to map a key to multiple values, but has all
-the drawbacks of std::map. A sorted vector or some other approach is almost
+``std::multimap`` is useful if you want to map a key to multiple values, but has all
+the drawbacks of ``std::map``. A sorted vector or some other approach is almost
always better.
.. _ds_bit:
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 48d2ef1..021f321 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -68,6 +68,12 @@ Changes to TableGen
Changes to Interprocedural Optimizations
----------------------------------------
+Changes to Vectorizers
+----------------------------------------
+
+* Added initial support for copyable elements in SLP, which models a copyable
+  element as `add <element>, 0`, i.e. it uses identity constants for missing lanes.
+
Changes to the AArch64 Backend
------------------------------
diff --git a/llvm/docs/TableGen/ProgRef.rst b/llvm/docs/TableGen/ProgRef.rst
index 7b30698..2b1af05 100644
--- a/llvm/docs/TableGen/ProgRef.rst
+++ b/llvm/docs/TableGen/ProgRef.rst
@@ -219,17 +219,17 @@ TableGen provides "bang operators" that have a wide variety of uses:
.. productionlist::
BangOperator: one of
- : !add !and !cast !con !dag
- : !div !empty !eq !exists !filter
- : !find !foldl !foreach !ge !getdagarg
- : !getdagname !getdagop !gt !head !if
- : !initialized !instances !interleave !isa !le
- : !listconcat !listflatten !listremove !listsplat !logtwo
- : !lt !match !mul !ne !not
- : !or !range !repr !setdagarg !setdagname
- : !setdagop !shl !size !sra !srl
- : !strconcat !sub !subst !substr !tail
- : !tolower !toupper !xor
+ : !add !and !cast !con !dag
+ : !div !empty !eq !exists !filter
+ : !find !foldl !foreach !ge !getdagarg
+ : !getdagname !getdagop !getdagopname !gt !head
+ : !if !initialized !instances !interleave !isa
+ : !le !listconcat !listflatten !listremove !listsplat
+ : !logtwo !lt !match !mul !ne
+ : !not !or !range !repr !setdagarg
+ : !setdagname !setdagop !setdagopname !shl !size
+ : !sra !srl !strconcat !sub !subst
+ : !substr !tail !tolower !toupper !xor
The ``!cond`` operator has a slightly different
syntax compared to other bang operators, so it is defined separately:
@@ -1443,7 +1443,8 @@ DAG.
The following bang operators are useful for working with DAGs:
``!con``, ``!dag``, ``!empty``, ``!foreach``, ``!getdagarg``, ``!getdagname``,
-``!getdagop``, ``!setdagarg``, ``!setdagname``, ``!setdagop``, ``!size``.
+``!getdagop``, ``!getdagopname``, ``!setdagarg``, ``!setdagname``, ``!setdagop``,
+``!setdagopname``, ``!size``.
Defvar in a record body
-----------------------
@@ -1695,9 +1696,11 @@ and non-0 as true.
This operator concatenates the DAG nodes *a*, *b*, etc. Their operations
must be equal.
- ``!con((op a1:$name1, a2:$name2), (op b1:$name3))``
+ ``!con((op:$lhs a1:$name1, a2:$name2), (op:$rhs b1:$name3))``
- results in the DAG node ``(op a1:$name1, a2:$name2, b1:$name3)``.
+ results in the DAG node ``(op:$lhs a1:$name1, a2:$name2, b1:$name3)``.
+ The name of the dag operator is derived from the LHS DAG node if it is
+ set, otherwise from the RHS DAG node.
``!cond(``\ *cond1* ``:`` *val1*\ ``,`` *cond2* ``:`` *val2*\ ``, ...,`` *condn* ``:`` *valn*\ ``)``
This operator tests *cond1* and returns *val1* if the result is true.
@@ -1819,6 +1822,10 @@ and non-0 as true.
dag d = !dag(!getdagop(someDag), args, names);
+``!getdagopname(``\ *dag*\ ``)``
+ This operator retrieves the name of the given *dag* operator. If the operator
+ has no associated name, ``?`` is returned.
+
``!gt(``\ *a*\ ``,`` *b*\ ``)``
This operator produces 1 if *a* is greater than *b*; 0 otherwise.
The arguments must be ``bit``, ``bits``, ``int``, or ``string`` values.
@@ -1949,6 +1956,10 @@ and non-0 as true.
Example: ``!setdagop((foo 1, 2), bar)`` results in ``(bar 1, 2)``.
+``!setdagopname(``\ *dag*\ ``,``\ *name*\ ``)``
+ This operator produces a DAG node with the same operator and arguments as
+ *dag*, but replacing the name of the operator with *name*.
+
``!shl(``\ *a*\ ``,`` *count*\ ``)``
This operator shifts *a* left logically by *count* bits and produces the resulting
value. The operation is performed on a 64-bit integer; the result
diff --git a/llvm/docs/YamlIO.rst b/llvm/docs/YamlIO.rst
index 420adb8..c5079d8 100644
--- a/llvm/docs/YamlIO.rst
+++ b/llvm/docs/YamlIO.rst
@@ -8,10 +8,10 @@ YAML I/O
Introduction to YAML
====================
-YAML is a human readable data serialization language. The full YAML language
+YAML is a human-readable data serialization language. The full YAML language
spec can be read at `yaml.org
<http://www.yaml.org/spec/1.2/spec.html#Introduction>`_. The simplest form of
-yaml is just "scalars", "mappings", and "sequences". A scalar is any number
+YAML is just "scalars", "mappings", and "sequences". A scalar is any number
or string. The pound/hash symbol (#) begins a comment line. A mapping is
a set of key-value pairs where the key ends with a colon. For example:
@@ -49,10 +49,10 @@ of mappings in which one of the mapping values is itself a sequence:
- PowerPC
- x86
-Sometime sequences are known to be short and the one entry per line is too
-verbose, so YAML offers an alternate syntax for sequences called a "Flow
+Sometimes sequences are known to be short, and one entry per line is too
+verbose, so YAML offers an alternative syntax for sequences called a "Flow
Sequence" in which you put comma separated sequence elements into square
-brackets. The above example could then be simplified to :
+brackets. The above example could then be simplified to:
.. code-block:: yaml
@@ -78,21 +78,21 @@ YAML I/O assumes you have some "native" data structures which you want to be
able to dump as YAML and recreate from YAML. The first step is to try
writing example YAML for your data structures. You may find after looking at
possible YAML representations that a direct mapping of your data structures
-to YAML is not very readable. Often the fields are not in the order that
+to YAML is not very readable. Often, the fields are not in an order that
a human would find readable. Or the same information is replicated in multiple
locations, making it hard for a human to write such YAML correctly.
In relational database theory there is a design step called normalization in
which you reorganize fields and tables. The same considerations need to
go into the design of your YAML encoding. But, you may not want to change
-your existing native data structures. Therefore, when writing out YAML
+your existing native data structures. Therefore, when writing out YAML,
there may be a normalization step, and when reading YAML there would be a
corresponding denormalization step.
-YAML I/O uses a non-invasive, traits based design. YAML I/O defines some
+YAML I/O uses a non-invasive, traits-based design. YAML I/O defines some
abstract base templates. You specialize those templates on your data types.
-For instance, if you have an enumerated type FooBar you could specialize
-ScalarEnumerationTraits on that type and define the ``enumeration()`` method:
+For instance, if you have an enumerated type ``FooBar`` you could specialize
+``ScalarEnumerationTraits`` on that type and define the ``enumeration()`` method:
.. code-block:: c++
@@ -107,12 +107,12 @@ ScalarEnumerationTraits on that type and define the ``enumeration()`` method:
};
-As with all YAML I/O template specializations, the ScalarEnumerationTraits is used for
+As with all YAML I/O template specializations, the ``ScalarEnumerationTraits`` is used for
both reading and writing YAML. That is, the mapping between in-memory enum
values and the YAML string representation is only in one place.
This assures that the code for writing and parsing of YAML stays in sync.
-To specify a YAML mappings, you define a specialization on
+To specify YAML mappings, you define a specialization on
``llvm::yaml::MappingTraits``.
If your native data structure happens to be a struct that is already normalized,
then the specialization is simple. For example:
@@ -131,7 +131,7 @@ then the specialization is simple. For example:
};
-A YAML sequence is automatically inferred if you data type has ``begin()``/``end()``
+A YAML sequence is automatically inferred if your data type has ``begin()``/``end()``
iterators and a ``push_back()`` method. Therefore any of the STL containers
(such as ``std::vector<>``) will automatically translate to YAML sequences.
@@ -197,7 +197,7 @@ Error Handling
When parsing a YAML document, if the input does not match your schema (as
expressed in your ``XxxTraits<>`` specializations), YAML I/O
will print out an error message and your Input object's ``error()`` method will
-return true. For instance the following document:
+return true. For instance, the following document:
.. code-block:: yaml
@@ -244,8 +244,8 @@ The following types have built-in support in YAML I/O:
* uint16_t
* uint8_t
-That is, you can use those types in fields of MappingTraits or as element type
-in sequence. When reading, YAML I/O will validate that the string found
+That is, you can use those types in fields of ``MappingTraits`` or as the element type
+in a sequence. When reading, YAML I/O will validate that the string found
is convertible to that type and error out if not.
@@ -255,7 +255,7 @@ Given that YAML I/O is trait based, the selection of how to convert your data
to YAML is based on the type of your data. But in C++ type matching, typedefs
do not generate unique type names. That means if you have two typedefs of
unsigned int, to YAML I/O both types look exactly like unsigned int. To
-facilitate make unique type names, YAML I/O provides a macro which is used
+facilitate making unique type names, YAML I/O provides a macro which is used
like a typedef on built-in types, but expands to create a class with conversion
operators to and from the base type. For example:
@@ -271,7 +271,7 @@ is that you can now specify traits on them to get different YAML conversions.
Hex types
---------
-An example use of a unique type is that YAML I/O provides fixed sized unsigned
+An example use of a unique type is that YAML I/O provides fixed-sized unsigned
integers that are written with YAML I/O as hexadecimal instead of the decimal
format used by the built-in integer types:
@@ -280,7 +280,7 @@ format used by the built-in integer types:
* Hex16
* Hex8
-You can use ``llvm::yaml::Hex32`` instead of ``uint32_t`` and the only different will
+You can use ``llvm::yaml::Hex32`` instead of ``uint32_t`` and the only difference will
be that when YAML I/O writes out that type it will be formatted in hexadecimal.
@@ -288,7 +288,7 @@ ScalarEnumerationTraits
-----------------------
YAML I/O supports translating between in-memory enumerations and a set of string
values in YAML documents. This is done by specializing ``ScalarEnumerationTraits<>``
-on your enumeration type and define an ``enumeration()`` method.
+on your enumeration type and defining an ``enumeration()`` method.
For instance, suppose you had an enumeration of CPUs and a struct with it as
a field:
@@ -306,7 +306,7 @@ a field:
};
To support reading and writing of this enumeration, you can define a
-ScalarEnumerationTraits specialization on CPUs, which can then be used
+``ScalarEnumerationTraits`` specialization on CPUs, which can then be used
as a field type:
.. code-block:: c++
@@ -356,7 +356,7 @@ had the following bit flags defined:
LLVM_YAML_STRONG_TYPEDEF(uint32_t, MyFlags)
-To support reading and writing of MyFlags, you specialize ScalarBitSetTraits<>
+To support reading and writing of ``MyFlags``, you specialize ``ScalarBitSetTraits<>``
on ``MyFlags`` and provide the bit values and their names.
.. code-block:: c++
the above schema, an equally valid YAML document is:
name: Tom
flags: [ pointy, flat ]
-Sometimes a "flags" field might contains an enumeration part
+Sometimes a "flags" field might contain an enumeration part
defined by a bit-mask.
.. code-block:: c++
@@ -415,7 +415,7 @@ defined by a bit-mask.
flagsCPU2 = 16
};
-To support reading and writing such fields, you need to use the maskedBitSet()
+To support reading and writing such fields, you need to use the ``maskedBitSet()``
method and provide the bit values, their names and the enumeration mask.
.. code-block:: c++
@@ -438,8 +438,8 @@ to the flow sequence.
Custom Scalar
-------------
-Sometimes for readability a scalar needs to be formatted in a custom way. For
-instance your internal data structure may use an integer for time (seconds since
+Sometimes, for readability, a scalar needs to be formatted in a custom way. For
+instance, your internal data structure may use an integer for time (seconds since
some epoch), but in YAML it would be much nicer to express that integer in
some time format (e.g. 4-May-2012 10:30pm). YAML I/O has a way to support
custom formatting and parsing of scalar types by specializing ``ScalarTraits<>`` on
@@ -487,13 +487,13 @@ your data type. The library doesn't provide any built-in support for block
scalar I/O for types like ``std::string`` and ``llvm::StringRef`` as they are already
supported by YAML I/O and use the ordinary scalar notation by default.
-BlockScalarTraits specializations are very similar to the
-ScalarTraits specialization - YAML I/O will provide the native type and your
+``BlockScalarTraits`` specializations are very similar to the
+``ScalarTraits`` specialization - YAML I/O will provide the native type and your
specialization must create a temporary ``llvm::StringRef`` when writing, and
it will also provide an ``llvm::StringRef`` that has the value of that block scalar
and your specialization must convert that to your native data type when reading.
An example of a custom type with an appropriate specialization of
-BlockScalarTraits is shown below:
+``BlockScalarTraits`` is shown below:
.. code-block:: c++
@@ -621,7 +621,7 @@ Polar which specifies a position in polar coordinates (distance, angle):
float angle;
};
-but you've decided the normalized YAML for should be in x,y coordinates. That
+but you've decided the normalized YAML form should be in x,y coordinates. That
is, you want the YAML to look like:
.. code-block:: yaml
@@ -762,19 +762,19 @@ tag on a map. Using this functionality it is even possible to support different
mappings, as long as they are convertible.
To check a tag, inside your ``mapping()`` method you can use ``io.mapTag()`` to specify
-what the tag should be. This will also add that tag when writing yaml.
+what the tag should be. This will also add that tag when writing YAML.
Validation
----------
Sometimes in a YAML map, each key/value pair is valid, but the combination is
not. This is similar to something having no syntax errors, but still having
-semantic errors. To support semantic level checking, YAML I/O allows
+semantic errors. To support semantic-level checking, YAML I/O allows
an optional ``validate()`` method in a ``MappingTraits`` template specialization.
When parsing YAML, the ``validate()`` method is called *after* all key/values in
the map have been processed. Any error message returned by the ``validate()``
-method during input will be printed just a like a syntax error would be printed.
+method during input will be printed just like a syntax error would be printed.
When writing YAML, the ``validate()`` method is called *before* the YAML
key/values are written. Any error during output will trigger an ``assert()``
because it is a programming error to have invalid struct values.
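
As a rough sketch, assuming a hypothetical ``Stuff`` struct and the
``std::string``-returning form of ``validate()`` (an empty string means success):

.. code-block:: c++

    template <>
    struct MappingTraits<Stuff> {
      static void mapping(IO &io, Stuff &stuff) {
        io.mapRequired("cookies", stuff.Cookies);
        io.mapRequired("jar-size", stuff.JarSize);
      }
      static std::string validate(IO &io, Stuff &stuff) {
        // Each field parses fine on its own; only the combination is invalid.
        if (stuff.Cookies > stuff.JarSize)
          return "more cookies than fit in the jar";
        return "";
      }
    };
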
@@ -827,7 +827,7 @@ add "static const bool flow = true;". For instance:
static const bool flow = true;
}
-Flow mappings are subject to line wrapping according to the Output object
+Flow mappings are subject to line wrapping according to the ``Output`` object
configuration.
Sequence
@@ -850,7 +850,7 @@ The ``size()`` method returns how many elements are currently in your sequence.
The ``element()`` method returns a reference to the i'th element in the sequence.
When parsing YAML, the ``element()`` method may be called with an index one bigger
than the current size. Your ``element()`` method should allocate space for one
-more element (using default constructor if element is a C++ object) and returns
+more element (using the default constructor if the element is a C++ object) and return
a reference to that newly allocated space.
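
A minimal sketch of a ``SequenceTraits`` specialization following these rules,
assuming a hypothetical vector-like ``MyList`` holding ``MyItem`` values:

.. code-block:: c++

    template <>
    struct SequenceTraits<MyList> {
      static size_t size(IO &io, MyList &list) { return list.size(); }
      static MyItem &element(IO &io, MyList &list, size_t index) {
        if (index >= list.size())
          list.resize(index + 1); // default-constructs the new element
        return list[index];
      }
    };
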
@@ -919,8 +919,8 @@ trait for your document list type. The trait has the same methods as
User Context Data
=================
-When an ``llvm::yaml::Input`` or ``llvm::yaml::Output`` object is created their
-constructors take an optional "context" parameter. This is a pointer to
+When an ``llvm::yaml::Input`` or ``llvm::yaml::Output`` object is created, its
+constructor takes an optional "context" parameter. This is a pointer to
whatever state information you might need.
For instance, in a previous example we showed how the conversion type for a
@@ -930,10 +930,10 @@ of an outer mapping? That is where the "context" parameter comes in. You
can set values in the context in the outer map's ``mapping()`` method and
retrieve those values in the inner map's ``mapping()`` method.
-The context value is just a void*. All your traits which use the context
+The context value is just a ``void*``. All your traits which use the context
and operate on your native data types need to agree on what the context value
actually is. It could be a pointer to an object or struct which your various
-traits use to shared context sensitive information.
+traits use to share context-sensitive information.
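
As a sketch, an outer mapping could publish a pointer to a hypothetical
``Ctx`` struct through the context for an inner mapping to consult:

.. code-block:: c++

    // In MappingTraits<Outer>:
    static void mapping(IO &io, Outer &outer) {
      io.setContext(&outer.Context); // visible to nested mappings
      io.mapRequired("inner", outer.Inner);
      io.setContext(nullptr);
    }

    // In MappingTraits<Inner>:
    static void mapping(IO &io, Inner &inner) {
      auto *C = static_cast<Ctx *>(io.getContext());
      // ... use C to decide how to map the fields ...
    }
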
Output
@@ -941,7 +941,7 @@ Output
The ``llvm::yaml::Output`` class is used to generate a YAML document from your
in-memory data structures, using traits defined on your data types.
-To instantiate an Output object you need an ``llvm::raw_ostream``, an optional
+To instantiate an ``Output`` object you need an ``llvm::raw_ostream``, an optional
context pointer and an optional wrapping column:
.. code-block:: c++
@@ -950,10 +950,10 @@ context pointer and an optional wrapping column:
public:
Output(llvm::raw_ostream &, void *context = NULL, int WrapColumn = 70);
-Once you have an Output object, you can use the C++ stream operator on it
+Once you have an ``Output`` object, you can use the C++ stream operator on it
to write your native data as YAML. One thing to recall is that a YAML file
can contain multiple "documents". If the top level data structure you are
-streaming as YAML is a mapping, scalar, or sequence, then Output assumes you
+streaming as YAML is a mapping, scalar, or sequence, then ``Output`` assumes you
are generating one document and wraps the mapping output
with "``---``" and trailing "``...``".
@@ -1009,7 +1009,7 @@ Input
=====
The ``llvm::yaml::Input`` class is used to parse YAML document(s) into your native
-data structures. To instantiate an Input
+data structures. To instantiate an ``Input``
object you need a ``StringRef`` to the entire YAML file, and optionally a context
pointer:
@@ -1019,12 +1019,12 @@ pointer:
public:
Input(StringRef inputContent, void *context=NULL);
-Once you have an Input object, you can use the C++ stream operator to read
+Once you have an ``Input`` object, you can use the C++ stream operator to read
the document(s). If you expect there might be multiple YAML documents in
-one file, you'll need to specialize DocumentListTraits on a list of your
+one file, you'll need to specialize ``DocumentListTraits`` on a list of your
document type and stream in that document list type. Otherwise you can
just stream in the document type. Also, you can check if there were
-any syntax errors in the YAML be calling the ``error()`` method on the Input
+any syntax errors in the YAML by calling the ``error()`` method on the ``Input``
object. For example:
.. code-block:: c++
diff --git a/llvm/include/llvm/Analysis/DXILResource.h b/llvm/include/llvm/Analysis/DXILResource.h
index 9e2dc1a..956dcbc 100644
--- a/llvm/include/llvm/Analysis/DXILResource.h
+++ b/llvm/include/llvm/Analysis/DXILResource.h
@@ -359,6 +359,8 @@ public:
std::tie(RHS.RecordID, RHS.Space, RHS.LowerBound, RHS.Size);
}
bool overlapsWith(const ResourceBinding &RHS) const {
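+ // A Size of UINT32_MAX denotes an unbounded binding; such a binding
+ // overlaps any binding that starts above its lower bound.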
+ if (Size == UINT32_MAX)
+ return LowerBound < RHS.LowerBound;
return Space == RHS.Space && LowerBound + Size - 1 >= RHS.LowerBound;
}
};
diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h
index d87457c..498c19b 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -153,6 +153,7 @@ class Vocabulary {
static_cast<unsigned>(OperandKind::MaxOperandKind),
"OperandKindNames array size must match MaxOperandKind");
+public:
/// Vocabulary layout constants
#define LAST_OTHER_INST(NUM) static constexpr unsigned MaxOpcodes = NUM;
#include "llvm/IR/Instruction.def"
@@ -162,7 +163,6 @@ class Vocabulary {
static constexpr unsigned MaxOperandKinds =
static_cast<unsigned>(OperandKind::MaxOperandKind);
-public:
Vocabulary() = default;
Vocabulary(VocabVector &&Vocab);
diff --git a/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h b/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
index 09e3945..bff7707 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
@@ -160,6 +160,12 @@ m_scev_ZExt(const Op0_t &Op0) {
return m_scev_Unary<SCEVZeroExtendExpr>(Op0);
}
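+/// Match a SCEVPtrToIntExpr whose operand matches Op0.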
+template <typename Op0_t>
+inline SCEVUnaryExpr_match<SCEVPtrToIntExpr, Op0_t>
+m_scev_PtrToInt(const Op0_t &Op0) {
+ return SCEVUnaryExpr_match<SCEVPtrToIntExpr, Op0_t>(Op0);
+}
+
/// Match a binary SCEV.
template <typename SCEVTy, typename Op0_t, typename Op1_t>
struct SCEVBinaryExpr_match {
diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index 9a2773c..b55c4e0 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -177,12 +177,6 @@ LLVM_ABI bool isVectorIntrinsicWithStructReturnOverloadAtField(
LLVM_ABI Intrinsic::ID
getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI);
-/// Returns the corresponding llvm.vector.interleaveN intrinsic for factor N.
-LLVM_ABI Intrinsic::ID getInterleaveIntrinsicID(unsigned Factor);
-
-/// Returns the corresponding llvm.vector.deinterleaveN intrinsic for factor N.
-LLVM_ABI Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor);
-
/// Returns the corresponding factor of llvm.vector.interleaveN intrinsics.
LLVM_ABI unsigned getInterleaveIntrinsicFactor(Intrinsic::ID ID);
diff --git a/llvm/include/llvm/CodeGen/CommandFlags.h b/llvm/include/llvm/CodeGen/CommandFlags.h
index aefdb53..d500e94 100644
--- a/llvm/include/llvm/CodeGen/CommandFlags.h
+++ b/llvm/include/llvm/CodeGen/CommandFlags.h
@@ -133,6 +133,8 @@ LLVM_ABI bool getEnableStackSizeSection();
LLVM_ABI bool getEnableAddrsig();
+LLVM_ABI bool getEnableCallGraphSection();
+
LLVM_ABI bool getEmitCallSiteInfo();
LLVM_ABI bool getEnableMachineFunctionSplitter();
diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
index 119786f..0f3f945 100644
--- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h
+++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
@@ -482,6 +482,8 @@ struct CallSiteInfo {
MachineInstrLoc CallLocation;
std::vector<ArgRegPair> ArgForwardingRegs;
+ /// Numeric callee type identifiers for the callgraph section.
+ std::vector<uint64_t> CalleeTypeIds;
bool operator==(const CallSiteInfo &Other) const {
return CallLocation.BlockNum == Other.CallLocation.BlockNum &&
@@ -511,6 +513,7 @@ template <> struct MappingTraits<CallSiteInfo> {
YamlIO.mapRequired("offset", CSInfo.CallLocation.Offset);
YamlIO.mapOptional("fwdArgRegs", CSInfo.ArgForwardingRegs,
std::vector<CallSiteInfo::ArgRegPair>());
+ YamlIO.mapOptional("calleeTypeIds", CSInfo.CalleeTypeIds);
}
static const bool flow = true;
diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h
index e5958ec..7f88323 100644
--- a/llvm/include/llvm/CodeGen/MachineFunction.h
+++ b/llvm/include/llvm/CodeGen/MachineFunction.h
@@ -515,6 +515,8 @@ public:
struct CallSiteInfo {
/// Vector of call argument and its forwarding register.
SmallVector<ArgRegPair, 1> ArgRegPairs;
+ /// Callee type ids.
+ SmallVector<ConstantInt *, 4> CalleeTypeIds;
};
struct CalledGlobalInfo {
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index eac8e14..e5644a5 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1428,10 +1428,9 @@ public:
EVT MemVT, MachineMemOperand *MMO);
/// Creates a LifetimeSDNode that starts (`IsStart==true`) or ends
- /// (`IsStart==false`) the lifetime of the portion of `FrameIndex` between
- /// offsets `0` and `Size`.
+ /// (`IsStart==false`) the lifetime of the `FrameIndex`.
LLVM_ABI SDValue getLifetimeNode(bool IsStart, const SDLoc &dl, SDValue Chain,
- int FrameIndex, int64_t Size);
+ int FrameIndex);
/// Creates a PseudoProbeSDNode with function GUID `Guid` and
/// the index of the block `Index` it is probing, as well as the attributes
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 8e9c1f7..8f88811 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -1999,23 +1999,19 @@ public:
}
};
-/// This SDNode is used for LIFETIME_START/LIFETIME_END values, which indicate
-/// the offet and size that are started/ended in the underlying FrameIndex.
+/// This SDNode is used for LIFETIME_START/LIFETIME_END values.
class LifetimeSDNode : public SDNode {
friend class SelectionDAG;
- int64_t Size;
LifetimeSDNode(unsigned Opcode, unsigned Order, const DebugLoc &dl,
- SDVTList VTs, int64_t Size)
- : SDNode(Opcode, Order, dl, VTs), Size(Size) {}
+ SDVTList VTs)
+ : SDNode(Opcode, Order, dl, VTs) {}
public:
int64_t getFrameIndex() const {
return cast<FrameIndexSDNode>(getOperand(1))->getIndex();
}
- int64_t getSize() const { return Size; }
-
// Methods to support isa and dyn_cast
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::LIFETIME_START ||
diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake
index a683229..39136bc 100644
--- a/llvm/include/llvm/Config/llvm-config.h.cmake
+++ b/llvm/include/llvm/Config/llvm-config.h.cmake
@@ -101,6 +101,9 @@
/* Define if LLVM is using tflite */
#cmakedefine LLVM_HAVE_TFLITE
+/* Define if we want to check profile consistency in lit tests */
+#cmakedefine LLVM_ENABLE_PROFCHECK
+
/* Define to 1 if you have the <sysexits.h> header file. */
#cmakedefine HAVE_SYSEXITS_H ${HAVE_SYSEXITS_H}
diff --git a/llvm/include/llvm/Demangle/DemangleConfig.h b/llvm/include/llvm/Demangle/DemangleConfig.h
index 8807a0e..912c9b8 100644
--- a/llvm/include/llvm/Demangle/DemangleConfig.h
+++ b/llvm/include/llvm/Demangle/DemangleConfig.h
@@ -15,6 +15,9 @@
#ifndef LLVM_DEMANGLE_DEMANGLECONFIG_H
#define LLVM_DEMANGLE_DEMANGLECONFIG_H
+// llvm-config.h is required for LLVM_ENABLE_LLVM_EXPORT_ANNOTATIONS
+#include "llvm/Config/llvm-config.h"
+
#ifndef __has_feature
#define __has_feature(x) 0
#endif
@@ -103,7 +106,7 @@
#if defined(LLVM_EXPORTS)
#define DEMANGLE_ABI __declspec(dllexport)
#else
-#define DEMANGLE_ABI__declspec(dllimport)
+#define DEMANGLE_ABI __declspec(dllimport)
#endif
#else
#if __has_attribute(visibility)
diff --git a/llvm/include/llvm/IR/DiagnosticInfo.h b/llvm/include/llvm/IR/DiagnosticInfo.h
index 862be04..5f7225e 100644
--- a/llvm/include/llvm/IR/DiagnosticInfo.h
+++ b/llvm/include/llvm/IR/DiagnosticInfo.h
@@ -68,6 +68,7 @@ enum DiagnosticKind {
DK_StackSize,
DK_Linker,
DK_Lowering,
+ DK_LegalizationFailure,
DK_DebugMetadataVersion,
DK_DebugMetadataInvalid,
DK_Instrumentation,
@@ -383,6 +384,30 @@ private:
DiagnosticLocation Loc;
};
+class LLVM_ABI DiagnosticInfoLegalizationFailure
+ : public DiagnosticInfoWithLocationBase {
+private:
+ /// Message to be reported.
+ const Twine &MsgStr;
+
+public:
+ DiagnosticInfoLegalizationFailure(const Twine &MsgStr LLVM_LIFETIME_BOUND,
+ const Function &Fn,
+ const DiagnosticLocation &Loc,
+ DiagnosticSeverity Severity = DS_Error)
+ : DiagnosticInfoWithLocationBase(DK_LegalizationFailure, Severity, Fn,
+ Loc),
+ MsgStr(MsgStr) {}
+
+ const Twine &getMsgStr() const { return MsgStr; }
+
+ void print(DiagnosticPrinter &DP) const override;
+
+ static bool classof(const DiagnosticInfo *DI) {
+ return DI->getKind() == DK_LegalizationFailure;
+ }
+};
+
class LLVM_ABI DiagnosticInfoGenericWithLoc
: public DiagnosticInfoWithLocationBase {
private:
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 7c600e7..6d3d864 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -2614,6 +2614,8 @@ public:
return CreateShuffleVector(V, PoisonValue::get(V->getType()), Mask, Name);
}
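+ /// Interleave the vectors in \p Ops into a single wide vector; the number
+ /// of operands determines the interleave factor.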
+ Value *CreateVectorInterleave(ArrayRef<Value *> Ops, const Twine &Name = "");
+
Value *CreateExtractValue(Value *Agg, ArrayRef<unsigned> Idxs,
const Twine &Name = "") {
if (auto *V = Folder.FoldExtractValue(Agg, Idxs))
diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h
index 1568052..48735b0 100644
--- a/llvm/include/llvm/IR/Intrinsics.h
+++ b/llvm/include/llvm/IR/Intrinsics.h
@@ -283,8 +283,15 @@ namespace Intrinsic {
// or of the wrong kind will be renamed by adding ".renamed" to the name.
LLVM_ABI std::optional<Function *> remangleIntrinsicFunction(Function *F);
-} // End Intrinsic namespace
+ /// Returns the corresponding llvm.vector.interleaveN intrinsic for factor N.
+ LLVM_ABI Intrinsic::ID getInterleaveIntrinsicID(unsigned Factor);
-} // End llvm namespace
+ /// Returns the corresponding llvm.vector.deinterleaveN intrinsic for factor
+ /// N.
+ LLVM_ABI Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor);
+
+ } // namespace Intrinsic
+
+ } // namespace llvm
#endif
diff --git a/llvm/include/llvm/IR/Metadata.h b/llvm/include/llvm/IR/Metadata.h
index af252aa..33203ad 100644
--- a/llvm/include/llvm/IR/Metadata.h
+++ b/llvm/include/llvm/IR/Metadata.h
@@ -759,18 +759,18 @@ public:
/// memory access used by the alias-analysis infrastructure.
struct AAMDNodes {
explicit AAMDNodes() = default;
- explicit AAMDNodes(MDNode *T, MDNode *TS, MDNode *S, MDNode *N)
- : TBAA(T), TBAAStruct(TS), Scope(S), NoAlias(N) {}
+ explicit AAMDNodes(MDNode *T, MDNode *TS, MDNode *S, MDNode *N, MDNode *NAS)
+ : TBAA(T), TBAAStruct(TS), Scope(S), NoAlias(N), NoAliasAddrSpace(NAS) {}
bool operator==(const AAMDNodes &A) const {
return TBAA == A.TBAA && TBAAStruct == A.TBAAStruct && Scope == A.Scope &&
- NoAlias == A.NoAlias;
+ NoAlias == A.NoAlias && NoAliasAddrSpace == A.NoAliasAddrSpace;
}
bool operator!=(const AAMDNodes &A) const { return !(*this == A); }
explicit operator bool() const {
- return TBAA || TBAAStruct || Scope || NoAlias;
+ return TBAA || TBAAStruct || Scope || NoAlias || NoAliasAddrSpace;
}
/// The tag for type-based alias analysis.
@@ -785,6 +785,9 @@ struct AAMDNodes {
/// The tag specifying the noalias scope.
MDNode *NoAlias = nullptr;
+ /// The tag specifying the noalias address spaces.
+ MDNode *NoAliasAddrSpace = nullptr;
+
// Shift tbaa Metadata node to start off bytes later
LLVM_ABI static MDNode *shiftTBAA(MDNode *M, size_t off);
@@ -806,6 +809,8 @@ struct AAMDNodes {
Result.TBAAStruct = Other.TBAAStruct == TBAAStruct ? TBAAStruct : nullptr;
Result.Scope = Other.Scope == Scope ? Scope : nullptr;
Result.NoAlias = Other.NoAlias == NoAlias ? NoAlias : nullptr;
+ Result.NoAliasAddrSpace =
+ Other.NoAliasAddrSpace == NoAliasAddrSpace ? NoAliasAddrSpace : nullptr;
return Result;
}
@@ -818,6 +823,7 @@ struct AAMDNodes {
TBAAStruct ? shiftTBAAStruct(TBAAStruct, Offset) : nullptr;
Result.Scope = Scope;
Result.NoAlias = NoAlias;
+ Result.NoAliasAddrSpace = NoAliasAddrSpace;
return Result;
}
@@ -833,6 +839,7 @@ struct AAMDNodes {
Result.TBAAStruct = TBAAStruct;
Result.Scope = Scope;
Result.NoAlias = NoAlias;
+ Result.NoAliasAddrSpace = NoAliasAddrSpace;
return Result;
}
@@ -860,12 +867,12 @@ struct AAMDNodes {
template<>
struct DenseMapInfo<AAMDNodes> {
static inline AAMDNodes getEmptyKey() {
- return AAMDNodes(DenseMapInfo<MDNode *>::getEmptyKey(),
- nullptr, nullptr, nullptr);
+ return AAMDNodes(DenseMapInfo<MDNode *>::getEmptyKey(), nullptr, nullptr,
+ nullptr, nullptr);
}
static inline AAMDNodes getTombstoneKey() {
- return AAMDNodes(DenseMapInfo<MDNode *>::getTombstoneKey(),
+ return AAMDNodes(DenseMapInfo<MDNode *>::getTombstoneKey(), nullptr,
nullptr, nullptr, nullptr);
}
@@ -873,7 +880,8 @@ struct DenseMapInfo<AAMDNodes> {
return DenseMapInfo<MDNode *>::getHashValue(Val.TBAA) ^
DenseMapInfo<MDNode *>::getHashValue(Val.TBAAStruct) ^
DenseMapInfo<MDNode *>::getHashValue(Val.Scope) ^
- DenseMapInfo<MDNode *>::getHashValue(Val.NoAlias);
+ DenseMapInfo<MDNode *>::getHashValue(Val.NoAlias) ^
+ DenseMapInfo<MDNode *>::getHashValue(Val.NoAliasAddrSpace);
}
static bool isEqual(const AAMDNodes &LHS, const AAMDNodes &RHS) {
diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h
index 71da048..6c12cd3 100644
--- a/llvm/include/llvm/MC/MCAsmInfo.h
+++ b/llvm/include/llvm/MC/MCAsmInfo.h
@@ -35,6 +35,7 @@ class MCStreamer;
class MCSubtargetInfo;
class MCSymbol;
class MCValue;
+class Triple;
class raw_ostream;
namespace WinEH {
@@ -485,6 +486,9 @@ public:
/// syntactically correct.
virtual bool isValidUnquotedName(StringRef Name) const;
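+ /// Print the directive that switches the assembly output to the given
+ /// section. The object-file-specific MCAsmInfo subclasses override this.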
+ virtual void printSwitchToSection(const MCSection &, uint32_t Subsection,
+ const Triple &, raw_ostream &) const {}
+
/// Return true if the .section directive should be omitted when
/// emitting \p SectionName. For example:
///
@@ -494,6 +498,10 @@ public:
/// returns true => .text
virtual bool shouldOmitSectionDirective(StringRef SectionName) const;
+ /// Return true if a .align directive should use "optimized nops" to fill
+ /// instead of 0s.
+ virtual bool useCodeAlign(const MCSection &Sec) const { return false; }
+
bool usesSunStyleELFSectionSwitchSyntax() const {
return SunStyleELFSectionSwitchSyntax;
}
diff --git a/llvm/include/llvm/MC/MCAsmInfoCOFF.h b/llvm/include/llvm/MC/MCAsmInfoCOFF.h
index 1dfb475..dc7832c 100644
--- a/llvm/include/llvm/MC/MCAsmInfoCOFF.h
+++ b/llvm/include/llvm/MC/MCAsmInfoCOFF.h
@@ -15,6 +15,9 @@ namespace llvm {
class MCAsmInfoCOFF : public MCAsmInfo {
virtual void anchor();
+ void printSwitchToSection(const MCSection &, uint32_t, const Triple &,
+ raw_ostream &) const final;
+ bool useCodeAlign(const MCSection &Sec) const final;
protected:
explicit MCAsmInfoCOFF();
diff --git a/llvm/include/llvm/MC/MCAsmInfoDarwin.h b/llvm/include/llvm/MC/MCAsmInfoDarwin.h
index 4ca62b3..12bc3e9 100644
--- a/llvm/include/llvm/MC/MCAsmInfoDarwin.h
+++ b/llvm/include/llvm/MC/MCAsmInfoDarwin.h
@@ -21,6 +21,9 @@ namespace llvm {
class MCAsmInfoDarwin : public MCAsmInfo {
public:
explicit MCAsmInfoDarwin();
+ void printSwitchToSection(const MCSection &, uint32_t, const Triple &,
+ raw_ostream &) const final;
+ bool useCodeAlign(const MCSection &Sec) const final;
/// True if the section is atomized using the symbols in it.
/// This is false if the section is atomized based on its contents (MachO' __TEXT,__cstring for
diff --git a/llvm/include/llvm/MC/MCAsmInfoELF.h b/llvm/include/llvm/MC/MCAsmInfoELF.h
index 408d4df..c05e4ad 100644
--- a/llvm/include/llvm/MC/MCAsmInfoELF.h
+++ b/llvm/include/llvm/MC/MCAsmInfoELF.h
@@ -16,6 +16,9 @@ namespace llvm {
class MCAsmInfoELF : public MCAsmInfo {
virtual void anchor();
MCSection *getNonexecutableStackSection(MCContext &Ctx) const final;
+ void printSwitchToSection(const MCSection &, uint32_t, const Triple &,
+ raw_ostream &) const final;
+ bool useCodeAlign(const MCSection &Sec) const final;
protected:
MCAsmInfoELF();
diff --git a/llvm/include/llvm/MC/MCAsmInfoGOFF.h b/llvm/include/llvm/MC/MCAsmInfoGOFF.h
index 1f3b263..e62d2ae 100644
--- a/llvm/include/llvm/MC/MCAsmInfoGOFF.h
+++ b/llvm/include/llvm/MC/MCAsmInfoGOFF.h
@@ -19,7 +19,8 @@
namespace llvm {
class MCAsmInfoGOFF : public MCAsmInfo {
- virtual void anchor();
+ void printSwitchToSection(const MCSection &, uint32_t, const Triple &,
+ raw_ostream &) const final;
protected:
MCAsmInfoGOFF();
diff --git a/llvm/include/llvm/MC/MCAsmInfoWasm.h b/llvm/include/llvm/MC/MCAsmInfoWasm.h
index 3afc610..d98de6c 100644
--- a/llvm/include/llvm/MC/MCAsmInfoWasm.h
+++ b/llvm/include/llvm/MC/MCAsmInfoWasm.h
@@ -13,7 +13,8 @@
namespace llvm {
class MCAsmInfoWasm : public MCAsmInfo {
- virtual void anchor();
+ void printSwitchToSection(const MCSection &, uint32_t, const Triple &,
+ raw_ostream &) const final;
protected:
MCAsmInfoWasm();
diff --git a/llvm/include/llvm/MC/MCAsmInfoXCOFF.h b/llvm/include/llvm/MC/MCAsmInfoXCOFF.h
index 5483899..fd1ae82 100644
--- a/llvm/include/llvm/MC/MCAsmInfoXCOFF.h
+++ b/llvm/include/llvm/MC/MCAsmInfoXCOFF.h
@@ -14,10 +14,11 @@
namespace llvm {
class MCAsmInfoXCOFF : public MCAsmInfo {
- virtual void anchor();
-
protected:
MCAsmInfoXCOFF();
+ void printSwitchToSection(const MCSection &, uint32_t, const Triple &,
+ raw_ostream &) const final;
+ bool useCodeAlign(const MCSection &Sec) const final;
public:
// Return true only when C is an acceptable character inside a
diff --git a/llvm/include/llvm/MC/MCContext.h b/llvm/include/llvm/MC/MCContext.h
index c137f61..ddac161 100644
--- a/llvm/include/llvm/MC/MCContext.h
+++ b/llvm/include/llvm/MC/MCContext.h
@@ -333,8 +333,6 @@ private:
void reportCommon(SMLoc Loc,
std::function<void(SMDiagnostic &, const SourceMgr *)>);
- MCFragment *allocInitialFragment(MCSection &Sec);
-
MCSymbolTableEntry &getSymbolTableEntry(StringRef Name);
MCSymbol *createSymbolImpl(const MCSymbolTableEntry *Name, bool IsTemporary);
diff --git a/llvm/include/llvm/MC/MCMachObjectWriter.h b/llvm/include/llvm/MC/MCMachObjectWriter.h
index 51e4df5..170e2e7 100644
--- a/llvm/include/llvm/MC/MCMachObjectWriter.h
+++ b/llvm/include/llvm/MC/MCMachObjectWriter.h
@@ -16,7 +16,7 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCLinkerOptimizationHint.h"
#include "llvm/MC/MCObjectWriter.h"
-#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/StringTableBuilder.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/EndianStream.h"
@@ -276,7 +276,7 @@ public:
uint64_t SectionDataSize, uint32_t MaxProt,
uint32_t InitProt);
- void writeSection(const MCAssembler &Asm, const MCSection &Sec,
+ void writeSection(const MCAssembler &Asm, const MCSectionMachO &Sec,
uint64_t VMAddr, uint64_t FileOffset, unsigned Flags,
uint64_t RelocationsStart, unsigned NumRelocations);
diff --git a/llvm/include/llvm/MC/MCObjectStreamer.h b/llvm/include/llvm/MC/MCObjectStreamer.h
index aea93e9..5ac7aba 100644
--- a/llvm/include/llvm/MC/MCObjectStreamer.h
+++ b/llvm/include/llvm/MC/MCObjectStreamer.h
@@ -52,6 +52,10 @@ class MCObjectStreamer : public MCStreamer {
DenseMap<const MCSymbol *, SmallVector<PendingAssignment, 1>>
pendingAssignments;
+ SmallVector<std::unique_ptr<char[]>, 0> FragStorage;
+ // Available bytes in the current block for trailing data or new fragments.
+ size_t FragSpace = 0;
+
void emitInstToData(const MCInst &Inst, const MCSubtargetInfo &);
void emitCFIStartProcImpl(MCDwarfFrameInfo &Frame) override;
void emitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override;
@@ -84,10 +88,18 @@ public:
// Add a fragment with a variable-size tail and start a new empty fragment.
void insert(MCFragment *F);
+ char *getCurFragEnd() const {
+ return reinterpret_cast<char *>(CurFrag + 1) + CurFrag->getFixedSize();
+ }
+ MCFragment *allocFragSpace(size_t Headroom);
// Add a new fragment to the current section without a variable-size tail.
void newFragment();
+ void ensureHeadroom(size_t Headroom);
+ void appendContents(ArrayRef<char> Contents);
void appendContents(size_t Num, char Elt);
+ // Add a fixup to the current fragment. Call ensureHeadroom beforehand to
+ // ensure the fixup and appended content apply to the same fragment.
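+ // A typical emission sequence (illustrative):
+ //   ensureHeadroom(Size);
+ //   addFixup(Value, Kind);
+ //   appendContents(Size, 0);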
void addFixup(const MCExpr *Value, MCFixupKind Kind);
void emitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
@@ -102,7 +114,6 @@ public:
void emitSLEB128Value(const MCExpr *Value) override;
void emitWeakReference(MCSymbol *Alias, const MCSymbol *Target) override;
void changeSection(MCSection *Section, uint32_t Subsection = 0) override;
- void switchSectionNoPrint(MCSection *Section) override;
void emitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
/// Emit an instruction to a special fragment, because this instruction
diff --git a/llvm/include/llvm/MC/MCSection.h b/llvm/include/llvm/MC/MCSection.h
index c1f3f02..3bd986f 100644
--- a/llvm/include/llvm/MC/MCSection.h
+++ b/llvm/include/llvm/MC/MCSection.h
@@ -93,8 +93,7 @@ protected:
// Track content and fixups for the fixed-size part as fragments are
// appended to the section. The content remains immutable, except when
// modified by applyFixup.
- uint32_t ContentStart = 0;
- uint32_t ContentEnd = 0;
+ uint32_t FixedSize = 0;
uint32_t FixupStart = 0;
uint32_t FixupEnd = 0;
@@ -150,23 +149,6 @@ public:
MCFragment(const MCFragment &) = delete;
MCFragment &operator=(const MCFragment &) = delete;
- bool isEncoded() const {
- MCFragment::FragmentType Kind = getKind();
- switch (Kind) {
- default:
- return false;
- case MCFragment::FT_Relaxable:
- case MCFragment::FT_Data:
- case MCFragment::FT_Align:
- case MCFragment::FT_Dwarf:
- case MCFragment::FT_DwarfFrame:
- case MCFragment::FT_LEB:
- case MCFragment::FT_CVInlineLines:
- case MCFragment::FT_CVDefRange:
- return true;
- }
- }
-
MCFragment *getNext() const { return Next; }
FragmentType getKind() const { return Kind; }
@@ -205,18 +187,6 @@ public:
//== Content-related functions manage parent's storage using ContentStart and
// ContentSize.
- // Get a SmallVector reference. The caller should call doneAppending to update
- // `ContentEnd`.
- SmallVectorImpl<char> &getContentsForAppending();
- void doneAppending();
- void appendContents(ArrayRef<char> Contents) {
- getContentsForAppending().append(Contents.begin(), Contents.end());
- doneAppending();
- }
- void appendContents(size_t Num, char Elt) {
- getContentsForAppending().append(Num, Elt);
- doneAppending();
- }
MutableArrayRef<char> getContents();
ArrayRef<char> getContents() const;
@@ -225,10 +195,10 @@ public:
MutableArrayRef<char> getVarContents();
ArrayRef<char> getVarContents() const;
- size_t getFixedSize() const { return ContentEnd - ContentStart; }
+ size_t getFixedSize() const { return FixedSize; }
size_t getVarSize() const { return VarContentEnd - VarContentStart; }
size_t getSize() const {
- return ContentEnd - ContentStart + (VarContentEnd - VarContentStart);
+ return FixedSize + (VarContentEnd - VarContentStart);
}
//== Fixup-related functions manage parent's storage using FixupStart and
@@ -540,17 +510,6 @@ public:
friend class MCFragment;
static constexpr unsigned NonUniqueID = ~0U;
- enum SectionVariant {
- SV_COFF = 0,
- SV_ELF,
- SV_GOFF,
- SV_MachO,
- SV_Wasm,
- SV_XCOFF,
- SV_SPIRV,
- SV_DXContainer,
- };
-
struct iterator {
MCFragment *F = nullptr;
iterator() = default;
@@ -570,6 +529,8 @@ private:
// At parse time, this holds the fragment list of the current subsection. At
// layout time, this holds the concatenated fragment lists of all subsections.
FragList *CurFragList;
+ // In many object file formats, this denotes the section symbol. In Mach-O,
+ // this denotes an optional temporary label at the section start.
MCSymbol *Begin;
MCSymbol *End = nullptr;
/// The alignment requirement of this section.
@@ -589,6 +550,8 @@ private:
/// offset between two locations may not be fully resolved.
bool LinkerRelaxable : 1;
+ MCFragment DummyFragment;
+
// Mapping from subsection number to fragment list. At layout time, the
// subsection 0 list is replaced with concatenated fragments from all
// subsections.
@@ -602,12 +565,8 @@ private:
protected:
// TODO Make Name private when possible.
StringRef Name;
- SectionVariant Variant;
- MCSection(SectionVariant V, StringRef Name, bool IsText, bool IsBss,
- MCSymbol *Begin);
- // Protected non-virtual dtor prevents destroy through a base class pointer.
- ~MCSection() {}
+ MCSection(StringRef Name, bool IsText, bool IsBss, MCSymbol *Begin);
public:
MCSection(const MCSection &) = delete;
@@ -616,8 +575,6 @@ public:
StringRef getName() const { return Name; }
bool isText() const { return IsText; }
- SectionVariant getVariant() const { return Variant; }
-
MCSymbol *getBeginSymbol() { return Begin; }
const MCSymbol *getBeginSymbol() const {
return const_cast<MCSection *>(this)->getBeginSymbol();
@@ -650,7 +607,7 @@ public:
bool isLinkerRelaxable() const { return LinkerRelaxable; }
void setLinkerRelaxable() { LinkerRelaxable = true; }
- MCFragment &getDummyFragment() { return *Subsections[0].second.Head; }
+ MCFragment &getDummyFragment() { return DummyFragment; }
FragList *curFragList() const { return CurFragList; }
iterator begin() const { return iterator(CurFragList->Head); }
@@ -659,41 +616,16 @@ public:
void dump(DenseMap<const MCFragment *, SmallVector<const MCSymbol *, 0>>
*FragToSyms = nullptr) const;
- virtual void printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
- raw_ostream &OS,
- uint32_t Subsection) const = 0;
-
- /// Return true if a .align directive should use "optimized nops" to fill
- /// instead of 0s.
- virtual bool useCodeAlign() const = 0;
-
/// Check whether this section is "virtual", that is has no actual object
/// file contents.
bool isBssSection() const { return IsBss; }
};
-inline SmallVectorImpl<char> &MCFragment::getContentsForAppending() {
- SmallVectorImpl<char> &S = getParent()->ContentStorage;
- if (LLVM_UNLIKELY(ContentEnd != S.size())) {
- // Move the elements to the end. Reserve space to avoid invalidating
- // S.begin()+I for `append`.
- auto Size = ContentEnd - ContentStart;
- auto I = std::exchange(ContentStart, S.size());
- S.reserve(S.size() + Size);
- S.append(S.begin() + I, S.begin() + I + Size);
- }
- return S;
-}
-inline void MCFragment::doneAppending() {
- ContentEnd = getParent()->ContentStorage.size();
-}
inline MutableArrayRef<char> MCFragment::getContents() {
- return MutableArrayRef(getParent()->ContentStorage)
- .slice(ContentStart, ContentEnd - ContentStart);
+ return {reinterpret_cast<char *>(this + 1), FixedSize};
}
inline ArrayRef<char> MCFragment::getContents() const {
- return ArrayRef(getParent()->ContentStorage)
- .slice(ContentStart, ContentEnd - ContentStart);
+ return {reinterpret_cast<const char *>(this + 1), FixedSize};
}
inline MutableArrayRef<char> MCFragment::getVarContents() {
diff --git a/llvm/include/llvm/MC/MCSectionCOFF.h b/llvm/include/llvm/MC/MCSectionCOFF.h
index f979413a..71efc41 100644
--- a/llvm/include/llvm/MC/MCSectionCOFF.h
+++ b/llvm/include/llvm/MC/MCSectionCOFF.h
@@ -51,11 +51,12 @@ class MCSectionCOFF final : public MCSection {
private:
friend class MCContext;
+ friend class MCAsmInfoCOFF;
// The storage of Name is owned by MCContext's COFFUniquingMap.
MCSectionCOFF(StringRef Name, unsigned Characteristics,
MCSymbol *COMDATSymbol, int Selection, unsigned UniqueID,
MCSymbol *Begin)
- : MCSection(SV_COFF, Name, Characteristics & COFF::IMAGE_SCN_CNT_CODE,
+ : MCSection(Name, Characteristics & COFF::IMAGE_SCN_CNT_CODE,
Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA,
Begin),
Characteristics(Characteristics), COMDATSymbol(COMDATSymbol),
@@ -67,7 +68,7 @@ private:
public:
/// Decides whether a '.section' directive should be printed before the
/// section name
- bool shouldOmitSectionDirective(StringRef Name, const MCAsmInfo &MAI) const;
+ bool shouldOmitSectionDirective(StringRef Name) const;
unsigned getCharacteristics() const { return Characteristics; }
MCSymbol *getCOMDATSymbol() const { return COMDATSymbol; }
@@ -78,11 +79,6 @@ public:
bool isUnique() const { return UniqueID != NonUniqueID; }
unsigned getUniqueID() const { return UniqueID; }
- void printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
- raw_ostream &OS,
- uint32_t Subsection) const override;
- bool useCodeAlign() const override;
-
unsigned getOrAssignWinCFISectionID(unsigned *NextID) const {
if (WinCFISectionID == ~0U)
WinCFISectionID = (*NextID)++;
@@ -92,8 +88,6 @@ public:
static bool isImplicitlyDiscardable(StringRef Name) {
return Name.starts_with(".debug");
}
-
- static bool classof(const MCSection *S) { return S->getVariant() == SV_COFF; }
};
} // end namespace llvm
diff --git a/llvm/include/llvm/MC/MCSectionDXContainer.h b/llvm/include/llvm/MC/MCSectionDXContainer.h
index 723b477..7d8e0c5 100644
--- a/llvm/include/llvm/MC/MCSectionDXContainer.h
+++ b/llvm/include/llvm/MC/MCSectionDXContainer.h
@@ -24,13 +24,7 @@ class MCSectionDXContainer final : public MCSection {
friend class MCContext;
MCSectionDXContainer(StringRef Name, SectionKind K, MCSymbol *Begin)
- : MCSection(SV_DXContainer, Name, K.isText(), /*IsVirtual=*/false,
- Begin) {}
-
-public:
- void printSwitchToSection(const MCAsmInfo &, const Triple &, raw_ostream &,
- uint32_t) const override;
- bool useCodeAlign() const override { return false; }
+ : MCSection(Name, K.isText(), /*IsVirtual=*/false, Begin) {}
};
} // end namespace llvm
diff --git a/llvm/include/llvm/MC/MCSectionELF.h b/llvm/include/llvm/MC/MCSectionELF.h
index 64a4daf..f089dd9 100644
--- a/llvm/include/llvm/MC/MCSectionELF.h
+++ b/llvm/include/llvm/MC/MCSectionELF.h
@@ -52,14 +52,15 @@ class MCSectionELF final : public MCSection {
private:
friend class MCContext;
+ friend class MCAsmInfoELF;
// The storage of Name is owned by MCContext's ELFUniquingMap.
MCSectionELF(StringRef Name, unsigned type, unsigned flags,
unsigned entrySize, const MCSymbolELF *group, bool IsComdat,
unsigned UniqueID, MCSymbol *Begin,
const MCSymbolELF *LinkedToSym)
- : MCSection(SV_ELF, Name, flags & ELF::SHF_EXECINSTR,
- type == ELF::SHT_NOBITS, Begin),
+ : MCSection(Name, flags & ELF::SHF_EXECINSTR, type == ELF::SHT_NOBITS,
+ Begin),
Type(type), Flags(flags), UniqueID(UniqueID), EntrySize(entrySize),
Group(group, IsComdat), LinkedToSym(LinkedToSym) {
assert((!(Flags & ELF::SHF_GROUP) || Group.getPointer()) &&
@@ -69,10 +70,6 @@ private:
}
public:
- /// Decides whether a '.section' directive should be printed before the
- /// section name
- bool shouldOmitSectionDirective(StringRef Name, const MCAsmInfo &MAI) const;
-
unsigned getType() const { return Type; }
unsigned getFlags() const { return Flags; }
unsigned getEntrySize() const { return EntrySize; }
@@ -80,11 +77,6 @@ public:
const MCSymbolELF *getGroup() const { return Group.getPointer(); }
bool isComdat() const { return Group.getInt(); }
- void printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
- raw_ostream &OS,
- uint32_t Subsection) const override;
- bool useCodeAlign() const override;
-
bool isUnique() const { return UniqueID != NonUniqueID; }
unsigned getUniqueID() const { return UniqueID; }
@@ -100,10 +92,6 @@ public:
std::pair<uint64_t, uint64_t> getOffsets() const {
return std::make_pair(StartOffset, EndOffset);
}
-
- static bool classof(const MCSection *S) {
- return S->getVariant() == SV_ELF;
- }
};
} // end namespace llvm
diff --git a/llvm/include/llvm/MC/MCSectionGOFF.h b/llvm/include/llvm/MC/MCSectionGOFF.h
index b166397..2136148 100644
--- a/llvm/include/llvm/MC/MCSectionGOFF.h
+++ b/llvm/include/llvm/MC/MCSectionGOFF.h
@@ -52,36 +52,28 @@ class LLVM_ABI MCSectionGOFF final : public MCSection {
mutable unsigned Emitted : 1;
friend class MCContext;
+ friend class MCAsmInfoGOFF;
friend class MCSymbolGOFF;
MCSectionGOFF(StringRef Name, SectionKind K, bool IsVirtual,
GOFF::SDAttr SDAttributes, MCSectionGOFF *Parent)
- : MCSection(SV_GOFF, Name, K.isText(), IsVirtual, nullptr),
- Parent(Parent), SDAttributes(SDAttributes),
- SymbolType(GOFF::ESD_ST_SectionDefinition), IsBSS(K.isBSS()),
- RequiresNonZeroLength(0), Emitted(0) {}
+ : MCSection(Name, K.isText(), IsVirtual, nullptr), Parent(Parent),
+ SDAttributes(SDAttributes), SymbolType(GOFF::ESD_ST_SectionDefinition),
+ IsBSS(K.isBSS()), RequiresNonZeroLength(0), Emitted(0) {}
MCSectionGOFF(StringRef Name, SectionKind K, bool IsVirtual,
GOFF::EDAttr EDAttributes, MCSectionGOFF *Parent)
- : MCSection(SV_GOFF, Name, K.isText(), IsVirtual, nullptr),
- Parent(Parent), EDAttributes(EDAttributes),
- SymbolType(GOFF::ESD_ST_ElementDefinition), IsBSS(K.isBSS()),
- RequiresNonZeroLength(0), Emitted(0) {}
+ : MCSection(Name, K.isText(), IsVirtual, nullptr), Parent(Parent),
+ EDAttributes(EDAttributes), SymbolType(GOFF::ESD_ST_ElementDefinition),
+ IsBSS(K.isBSS()), RequiresNonZeroLength(0), Emitted(0) {}
MCSectionGOFF(StringRef Name, SectionKind K, bool IsVirtual,
GOFF::PRAttr PRAttributes, MCSectionGOFF *Parent)
- : MCSection(SV_GOFF, Name, K.isText(), IsVirtual, nullptr),
- Parent(Parent), PRAttributes(PRAttributes),
- SymbolType(GOFF::ESD_ST_PartReference), IsBSS(K.isBSS()),
- RequiresNonZeroLength(0), Emitted(0) {}
+ : MCSection(Name, K.isText(), IsVirtual, nullptr), Parent(Parent),
+ PRAttributes(PRAttributes), SymbolType(GOFF::ESD_ST_PartReference),
+ IsBSS(K.isBSS()), RequiresNonZeroLength(0), Emitted(0) {}
public:
- void printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
- raw_ostream &OS,
- uint32_t Subsection) const override;
-
- bool useCodeAlign() const override { return false; }
-
// Return the parent section.
MCSectionGOFF *getParent() const { return Parent; }
@@ -123,8 +115,6 @@ public:
bool requiresNonZeroLength() const { return RequiresNonZeroLength; }
void setName(StringRef SectionName) { Name = SectionName; }
-
- static bool classof(const MCSection *S) { return S->getVariant() == SV_GOFF; }
};
} // end namespace llvm
diff --git a/llvm/include/llvm/MC/MCSectionMachO.h b/llvm/include/llvm/MC/MCSectionMachO.h
index 4312175..a65d7e0 100644
--- a/llvm/include/llvm/MC/MCSectionMachO.h
+++ b/llvm/include/llvm/MC/MCSectionMachO.h
@@ -23,6 +23,8 @@ namespace llvm {
/// This represents a section on a Mach-O system (used by Mac OS X). On a Mac
/// system, these are also described in /usr/include/mach-o/loader.h.
class LLVM_ABI MCSectionMachO final : public MCSection {
+ friend class MCContext;
+ friend class MCAsmInfoDarwin;
char SegmentName[16]; // Not necessarily null terminated!
/// This is the SECTION_TYPE and SECTION_ATTRIBUTES field of a section, drawn
@@ -42,7 +44,6 @@ class LLVM_ABI MCSectionMachO final : public MCSection {
MCSectionMachO(StringRef Segment, StringRef Section, unsigned TAA,
unsigned reserved2, SectionKind K, MCSymbol *Begin);
- friend class MCContext;
public:
StringRef getSegmentName() const {
@@ -76,21 +77,12 @@ public:
bool &TAAParsed, // Out.
unsigned &StubSize); // Out.
- void printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
- raw_ostream &OS,
- uint32_t Subsection) const override;
- bool useCodeAlign() const override;
-
void allocAtoms();
const MCSymbol *getAtom(size_t I) const;
void setAtom(size_t I, const MCSymbol *Sym);
unsigned getLayoutOrder() const { return LayoutOrder; }
void setLayoutOrder(unsigned Value) { LayoutOrder = Value; }
-
- static bool classof(const MCSection *S) {
- return S->getVariant() == SV_MachO;
- }
};
} // end namespace llvm
diff --git a/llvm/include/llvm/MC/MCSectionSPIRV.h b/llvm/include/llvm/MC/MCSectionSPIRV.h
index 091114a..6850965 100644
--- a/llvm/include/llvm/MC/MCSectionSPIRV.h
+++ b/llvm/include/llvm/MC/MCSectionSPIRV.h
@@ -18,22 +18,13 @@
namespace llvm {
-class MCSymbol;
-
class MCSectionSPIRV final : public MCSection {
friend class MCContext;
MCSectionSPIRV()
- : MCSection(SV_SPIRV, "", /*IsText=*/true, /*IsVirtual=*/false,
+ : MCSection("", /*IsText=*/true, /*IsVirtual=*/false,
/*Begin=*/nullptr) {}
// TODO: Add StringRef Name to MCSectionSPIRV.
-
-public:
- ~MCSectionSPIRV() = default;
- void printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
- raw_ostream &OS,
- uint32_t Subsection) const override {}
- bool useCodeAlign() const override { return false; }
};
} // end namespace llvm
diff --git a/llvm/include/llvm/MC/MCSectionWasm.h b/llvm/include/llvm/MC/MCSectionWasm.h
index 4523818..5ec01ed 100644
--- a/llvm/include/llvm/MC/MCSectionWasm.h
+++ b/llvm/include/llvm/MC/MCSectionWasm.h
@@ -49,26 +49,18 @@ class MCSectionWasm final : public MCSection {
// The storage of Name is owned by MCContext's WasmUniquingMap.
friend class MCContext;
+ friend class MCAsmInfoWasm;
MCSectionWasm(StringRef Name, SectionKind K, unsigned SegmentFlags,
const MCSymbolWasm *Group, unsigned UniqueID, MCSymbol *Begin)
- : MCSection(SV_Wasm, Name, K.isText(), /*IsVirtual=*/false, Begin),
+ : MCSection(Name, K.isText(), /*IsVirtual=*/false, Begin),
UniqueID(UniqueID), Group(Group),
IsWasmData(K.isReadOnly() || K.isWriteable()),
IsMetadata(K.isMetadata()), SegmentFlags(SegmentFlags) {}
public:
- /// Decides whether a '.section' directive should be printed before the
- /// section name
- bool shouldOmitSectionDirective(StringRef Name, const MCAsmInfo &MAI) const;
-
const MCSymbolWasm *getGroup() const { return Group; }
unsigned getSegmentFlags() const { return SegmentFlags; }
- void printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
- raw_ostream &OS,
- uint32_t Subsection) const override;
- bool useCodeAlign() const override;
-
bool isWasmData() const { return IsWasmData; }
bool isMetadata() const { return IsMetadata; }
@@ -89,7 +81,6 @@ public:
assert(isWasmData());
IsPassive = V;
}
- static bool classof(const MCSection *S) { return S->getVariant() == SV_Wasm; }
};
} // end namespace llvm
diff --git a/llvm/include/llvm/MC/MCSectionXCOFF.h b/llvm/include/llvm/MC/MCSectionXCOFF.h
index 499df6b5..0571f95 100644
--- a/llvm/include/llvm/MC/MCSectionXCOFF.h
+++ b/llvm/include/llvm/MC/MCSectionXCOFF.h
@@ -31,6 +31,7 @@ namespace llvm {
// implemented yet.
class MCSectionXCOFF final : public MCSection {
friend class MCContext;
+ friend class MCAsmInfoXCOFF;
std::optional<XCOFF::CsectProperties> CsectProp;
MCSymbolXCOFF *const QualName;
@@ -46,7 +47,7 @@ class MCSectionXCOFF final : public MCSection {
XCOFF::SymbolType ST, SectionKind K, MCSymbolXCOFF *QualName,
MCSymbol *Begin, StringRef SymbolTableName,
bool MultiSymbolsAllowed)
- : MCSection(SV_XCOFF, Name, K.isText(),
+ : MCSection(Name, K.isText(),
/*IsVirtual=*/ST == XCOFF::XTY_CM && SMC != XCOFF::XMC_TD,
Begin),
CsectProp(XCOFF::CsectProperties(SMC, ST)), QualName(QualName),
@@ -77,7 +78,7 @@ class MCSectionXCOFF final : public MCSection {
XCOFF::DwarfSectionSubtypeFlags DwarfSubtypeFlags,
MCSymbol *Begin, StringRef SymbolTableName,
bool MultiSymbolsAllowed)
- : MCSection(SV_XCOFF, Name, K.isText(), /*IsVirtual=*/false, Begin),
+ : MCSection(Name, K.isText(), /*IsVirtual=*/false, Begin),
QualName(QualName), SymbolTableName(SymbolTableName),
DwarfSubtypeFlags(DwarfSubtypeFlags),
MultiSymbolsAllowed(MultiSymbolsAllowed), Kind(K) {
@@ -95,10 +96,6 @@ class MCSectionXCOFF final : public MCSection {
public:
~MCSectionXCOFF();
- static bool classof(const MCSection *S) {
- return S->getVariant() == SV_XCOFF;
- }
-
XCOFF::StorageMappingClass getMappingClass() const {
assert(isCsect() && "Only csect section has mapping class property!");
return CsectProp->MappingClass;
@@ -115,10 +112,6 @@ public:
}
MCSymbolXCOFF *getQualNameSymbol() const { return QualName; }
- void printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
- raw_ostream &OS,
- uint32_t Subsection) const override;
- bool useCodeAlign() const override;
StringRef getSymbolTableName() const { return SymbolTableName; }
void setSymbolTableName(StringRef STN) { SymbolTableName = STN; }
bool isMultiSymbolsAllowed() const { return MultiSymbolsAllowed; }
diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h
index dfaf348..79c715e 100644
--- a/llvm/include/llvm/MC/MCStreamer.h
+++ b/llvm/include/llvm/MC/MCStreamer.h
@@ -438,7 +438,7 @@ public:
assert(!CurFrag || CurFrag->getKind() == MCFragment::FT_Data);
return CurFrag;
}
- size_t getCurFragOffset() const { return getCurrentFragment()->Offset; }
+ size_t getCurFragSize() const { return getCurrentFragment()->getFixedSize(); }
/// Save the current and previous section on the section stack.
void pushSection() {
SectionStack.push_back(
@@ -459,7 +459,7 @@ public:
bool switchSection(MCSection *Section, const MCExpr *);
/// Similar to switchSection, but does not print the section directive.
- virtual void switchSectionNoPrint(MCSection *Section);
+ void switchSectionNoPrint(MCSection *Section);
/// Create the default sections and set the initial one.
virtual void initSections(bool NoExecStack, const MCSubtargetInfo &STI);
diff --git a/llvm/include/llvm/MC/MCSymbolELF.h b/llvm/include/llvm/MC/MCSymbolELF.h
index eba9964..7c271e7 100644
--- a/llvm/include/llvm/MC/MCSymbolELF.h
+++ b/llvm/include/llvm/MC/MCSymbolELF.h
@@ -13,6 +13,7 @@
namespace llvm {
class MCSymbolELF : public MCSymbol {
+ friend class MCAsmInfoELF;
/// An expression describing how to calculate the size of a symbol. If a
/// symbol has no size this field will be NULL.
const MCExpr *SymbolSize = nullptr;
diff --git a/llvm/include/llvm/MC/MCXCOFFStreamer.h b/llvm/include/llvm/MC/MCXCOFFStreamer.h
index 870d48f..c3bc2ca9 100644
--- a/llvm/include/llvm/MC/MCXCOFFStreamer.h
+++ b/llvm/include/llvm/MC/MCXCOFFStreamer.h
@@ -22,6 +22,7 @@ public:
XCOFFObjectWriter &getWriter();
+ void changeSection(MCSection *Section, uint32_t Subsection = 0) override;
bool emitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override;
void emitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
Align ByteAlignment) override;
diff --git a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
index a119b07..8f36739 100644
--- a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
+++ b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
@@ -223,7 +223,8 @@ enum : int32_t {
KERNEL_CODE_PROPERTY(ENABLE_SGPR_DISPATCH_ID, 4, 1),
KERNEL_CODE_PROPERTY(ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1),
KERNEL_CODE_PROPERTY(ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, 6, 1),
- KERNEL_CODE_PROPERTY(RESERVED0, 7, 3),
+ KERNEL_CODE_PROPERTY(RESERVED0, 7, 2),
+ KERNEL_CODE_PROPERTY(USES_CU_STORES, 9, 1), // GFX12.5 +cu-stores
KERNEL_CODE_PROPERTY(ENABLE_WAVEFRONT_SIZE32, 10, 1), // GFX10+
KERNEL_CODE_PROPERTY(USES_DYNAMIC_STACK, 11, 1),
KERNEL_CODE_PROPERTY(RESERVED1, 12, 4),
diff --git a/llvm/include/llvm/Support/Debug.h b/llvm/include/llvm/Support/Debug.h
index 924d7b2..5542089 100644
--- a/llvm/include/llvm/Support/Debug.h
+++ b/llvm/include/llvm/Support/Debug.h
@@ -39,13 +39,19 @@ class raw_ostream;
/// isCurrentDebugType - Return true if the specified string is the debug type
/// specified on the command line, or if none was specified on the command line
/// with the -debug-only=X option.
-///
-bool isCurrentDebugType(const char *Type);
+/// An optional level can be provided to control the verbosity of the output.
+/// If the provided level is not 0 and the user specified a level below the
+/// provided level, this returns false.
+bool isCurrentDebugType(const char *Type, int Level = 0);
/// setCurrentDebugType - Set the current debug type, as if the -debug-only=X
/// option were specified. Note that DebugFlag also needs to be set to true for
/// debug output to be produced.
-///
+/// The debug type format is "type[:level]", where the level is an optional
+/// integer. If a level is provided, the debug output is enabled only if the
+/// user specified a level at least as high as the provided level.
+/// 0 is a special level that acts as an opt-out for this specific debug type
+/// without affecting the other debug output.
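+/// For example (illustrative), setCurrentDebugType("isel:2") enables "isel"
+/// output at levels 1 and 2.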
void setCurrentDebugType(const char *Type);
/// setCurrentDebugTypes - Set the current debug type, as if the
diff --git a/llvm/include/llvm/Support/DebugLog.h b/llvm/include/llvm/Support/DebugLog.h
index 9556bf2..8fca2d5 100644
--- a/llvm/include/llvm/Support/DebugLog.h
+++ b/llvm/include/llvm/Support/DebugLog.h
@@ -19,47 +19,154 @@
namespace llvm {
#ifndef NDEBUG
-// Output with given inputs and trailing newline. E.g.,
+// LDBG() is a macro that can be used as a raw_ostream for debugging.
+// It will stream the output to the dbgs() stream, with a prefix of the
+// debug type and the file and line number. A trailing newline is added to the
+// output automatically. If the streamed content contains a newline, the prefix
+// is added at the beginning of each new line. Nothing is printed if the debug
+// output is not enabled or the debug type does not match.
+//
+// E.g.,
// LDBG() << "Bitset contains: " << Bitset;
-// is equivalent to
-// LLVM_DEBUG(dbgs() << DEBUG_TYPE << " [" << __FILE__ << ":" << __LINE__
-// << "] " << "Bitset contains: " << Bitset << "\n");
-#define LDBG() DEBUGLOG_WITH_STREAM_AND_TYPE(llvm::dbgs(), DEBUG_TYPE)
+// is roughly equivalent to
+// LLVM_DEBUG(dbgs() << "[" << DEBUG_TYPE << "] " << __FILE__ << ":" <<
+// __LINE__ << " "
+// << "Bitset contains: " << Bitset << "\n");
+//
+// An optional `level` argument can be provided to control the verbosity of the
+// output. The default level is 1; higher levels are more verbose.
+//
+// The `level` argument can be a literal integer, or a macro that evaluates to
+// an integer.
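+//
+// E.g.,
+//   LDBG(2) << "Expensive detail";
+// only prints when the user enabled level 2 or above for the current
+// DEBUG_TYPE (e.g. with -debug-only=<type>:2).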
+//
+#define LDBG(...) _GET_LDBG_MACRO(__VA_ARGS__)(__VA_ARGS__)
+
+// Helper macros to choose the correct macro based on the number of arguments.
+#define LDBG_FUNC_CHOOSER(_f1, _f2, ...) _f2
+#define LDBG_FUNC_RECOMPOSER(argsWithParentheses) \
+ LDBG_FUNC_CHOOSER argsWithParentheses
+#define LDBG_CHOOSE_FROM_ARG_COUNT(...) \
+ LDBG_FUNC_RECOMPOSER((__VA_ARGS__, LDBG_LOG_LEVEL, ))
+#define LDBG_NO_ARG_EXPANDER() , LDBG_LOG_LEVEL_1
+#define _GET_LDBG_MACRO(...) \
+ LDBG_CHOOSE_FROM_ARG_COUNT(LDBG_NO_ARG_EXPANDER __VA_ARGS__())
-#define DEBUGLOG_WITH_STREAM_AND_TYPE(STREAM, TYPE) \
- for (bool _c = (::llvm::DebugFlag && ::llvm::isCurrentDebugType(TYPE)); _c; \
- _c = false) \
- ::llvm::impl::LogWithNewline(TYPE, __FILE__, __LINE__, (STREAM))
+// Dispatch macros to support the `level` argument or none (default to 1)
+#define LDBG_LOG_LEVEL(LEVEL) \
+ DEBUGLOG_WITH_STREAM_AND_TYPE(llvm::dbgs(), LEVEL, DEBUG_TYPE)
+#define LDBG_LOG_LEVEL_1() LDBG_LOG_LEVEL(1)
+
+#define DEBUGLOG_WITH_STREAM_TYPE_FILE_AND_LINE(STREAM, LEVEL, TYPE, FILE, \
+ LINE) \
+ for (bool _c = \
+ (::llvm::DebugFlag && ::llvm::isCurrentDebugType(TYPE, LEVEL)); \
+ _c; _c = false) \
+ ::llvm::impl::raw_ldbg_ostream{ \
+ ::llvm::impl::computePrefix(TYPE, FILE, LINE, LEVEL), (STREAM)} \
+ .asLvalue()
+
+#define DEBUGLOG_WITH_STREAM_TYPE_AND_FILE(STREAM, LEVEL, TYPE, FILE) \
+ DEBUGLOG_WITH_STREAM_TYPE_FILE_AND_LINE(STREAM, LEVEL, TYPE, FILE, __LINE__)
+// When __SHORT_FILE__ is not defined, the File is the full path,
+// otherwise __SHORT_FILE__ is defined in CMake to provide the file name
+// without the path prefix.
+#if defined(__SHORT_FILE__)
+#define DEBUGLOG_WITH_STREAM_AND_TYPE(STREAM, LEVEL, TYPE) \
+ DEBUGLOG_WITH_STREAM_TYPE_AND_FILE(STREAM, LEVEL, TYPE, __SHORT_FILE__)
+#else
+#define DEBUGLOG_WITH_STREAM_AND_TYPE(STREAM, LEVEL, TYPE) \
+ DEBUGLOG_WITH_STREAM_TYPE_AND_FILE(STREAM, LEVEL, TYPE, \
+ ::llvm::impl::getShortFileName(__FILE__))
+#endif
namespace impl {
-class LogWithNewline {
+
+/// A raw_ostream that tracks `\n` and prints the prefix.
+class LLVM_ABI raw_ldbg_ostream final : public raw_ostream {
+ std::string Prefix;
+ raw_ostream &Os;
+ bool HasPendingNewline = true;
+
+ /// Split the output on newlines and emit the prefix at the start of each
+ /// line. Forward everything to the underlying stream.
+ void write_impl(const char *Ptr, size_t Size) final {
+ auto Str = StringRef(Ptr, Size);
+ // Handle the initial prefix.
+ if (!Str.empty())
+ writeWithPrefix(StringRef());
+
+ auto Eol = Str.find('\n');
+ while (Eol != StringRef::npos) {
+ StringRef Line = Str.take_front(Eol + 1);
+ if (!Line.empty())
+ writeWithPrefix(Line);
+ HasPendingNewline = true;
+ Str = Str.drop_front(Eol + 1);
+ Eol = Str.find('\n');
+ }
+ if (!Str.empty())
+ writeWithPrefix(Str);
+ }
+ void emitPrefix() { Os.write(Prefix.c_str(), Prefix.size()); }
+ void writeWithPrefix(StringRef Str) {
+ if (HasPendingNewline) {
+ emitPrefix();
+ HasPendingNewline = false;
+ }
+ Os.write(Str.data(), Str.size());
+ }
+
public:
- LogWithNewline(const char *debug_type, const char *file, int line,
- raw_ostream &os)
- : os(os) {
- if (debug_type)
- os << debug_type << " ";
- os << "[" << file << ":" << line << "] ";
+ explicit raw_ldbg_ostream(std::string Prefix, raw_ostream &Os)
+ : Prefix(std::move(Prefix)), Os(Os) {
+ SetUnbuffered();
}
- ~LogWithNewline() { os << '\n'; }
- template <typename T> raw_ostream &operator<<(const T &t) && {
- return os << t;
+ ~raw_ldbg_ostream() final {
+ flushEol();
+ Os << '\n';
+ }
+ void flushEol() {
+ if (HasPendingNewline) {
+ emitPrefix();
+ HasPendingNewline = false;
+ }
}
- // Prevent copying, as this class manages newline responsibility and is
- // intended for use as a temporary.
- LogWithNewline(const LogWithNewline &) = delete;
- LogWithNewline &operator=(const LogWithNewline &) = delete;
- LogWithNewline &operator=(LogWithNewline &&) = delete;
+ /// Forward the current_pos method to the underlying stream.
+ uint64_t current_pos() const final { return Os.tell(); }
-private:
- raw_ostream &os;
+ /// Some of the `<<` operators expect an lvalue, so we trick the type system.
+ raw_ldbg_ostream &asLvalue() { return *this; }
};
+
+/// Remove the path prefix from the file name.
+static LLVM_ATTRIBUTE_UNUSED constexpr const char *
+getShortFileName(const char *path) {
+ const char *filename = path;
+ for (const char *p = path; *p != '\0'; ++p) {
+ if (*p == '/' || *p == '\\')
+ filename = p + 1;
+ }
+ return filename;
+}
+
+/// Compute the prefix for the debug log in the form of:
+/// "[DebugType] File:Line "
+/// Where the File is the file name without the path prefix.
+static LLVM_ATTRIBUTE_UNUSED std::string
+computePrefix(const char *DebugType, const char *File, int Line, int Level) {
+ std::string Prefix;
+ raw_string_ostream OsPrefix(Prefix);
+ if (DebugType)
+ OsPrefix << "[" << DebugType << ":" << Level << "] ";
+ OsPrefix << File << ":" << Line << " ";
+ return OsPrefix.str();
+}
} // end namespace impl
#else
// As with the others in Debug.h, when compiling without assertions, the
// -debug-* options and all inputs to LDBG() are ignored.
-#define LDBG() \
+#define LDBG(...) \
for (bool _c = false; _c; _c = false) \
::llvm::nulls()
#endif
diff --git a/llvm/include/llvm/Support/ThreadPool.h b/llvm/include/llvm/Support/ThreadPool.h
index 9272760..c26681c 100644
--- a/llvm/include/llvm/Support/ThreadPool.h
+++ b/llvm/include/llvm/Support/ThreadPool.h
@@ -149,10 +149,6 @@ public:
/// number of threads!
unsigned getMaxConcurrency() const override { return MaxThreadCount; }
- // TODO: Remove, misleading legacy name warning!
- LLVM_DEPRECATED("Use getMaxConcurrency instead", "getMaxConcurrency")
- unsigned getThreadCount() const { return MaxThreadCount; }
-
/// Returns true if the current thread is a worker thread of this thread pool.
bool isWorkerThread() const;
@@ -233,10 +229,6 @@ public:
/// Returns always 1: there is no concurrency.
unsigned getMaxConcurrency() const override { return 1; }
- // TODO: Remove, misleading legacy name warning!
- LLVM_DEPRECATED("Use getMaxConcurrency instead", "getMaxConcurrency")
- unsigned getThreadCount() const { return 1; }
-
/// Returns true if the current thread is a worker thread of this thread pool.
bool isWorkerThread() const;
diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h
index a2b86eb..9d67d8b 100644
--- a/llvm/include/llvm/TableGen/Record.h
+++ b/llvm/include/llvm/TableGen/Record.h
@@ -841,6 +841,7 @@ public:
SIZE,
EMPTY,
GETDAGOP,
+ GETDAGOPNAME,
LOG2,
REPR,
LISTFLATTEN,
@@ -910,6 +911,7 @@ public:
GETDAGARG,
GETDAGNAME,
SETDAGOP,
+ SETDAGOPNAME
};
private:
diff --git a/llvm/include/llvm/Target/TargetOptions.h b/llvm/include/llvm/Target/TargetOptions.h
index f420798..db90f2e 100644
--- a/llvm/include/llvm/Target/TargetOptions.h
+++ b/llvm/include/llvm/Target/TargetOptions.h
@@ -133,10 +133,11 @@ public:
EmitStackSizeSection(false), EnableMachineOutliner(false),
EnableMachineFunctionSplitter(false),
EnableStaticDataPartitioning(false), SupportsDefaultOutlining(false),
- EmitAddrsig(false), BBAddrMap(false), EmitCallSiteInfo(false),
- SupportsDebugEntryValues(false), EnableDebugEntryValues(false),
- ValueTrackingVariableLocations(false), ForceDwarfFrameSection(false),
- XRayFunctionIndex(true), DebugStrictDwarf(false), Hotpatch(false),
+ EmitAddrsig(false), BBAddrMap(false), EmitCallGraphSection(false),
+ EmitCallSiteInfo(false), SupportsDebugEntryValues(false),
+ EnableDebugEntryValues(false), ValueTrackingVariableLocations(false),
+ ForceDwarfFrameSection(false), XRayFunctionIndex(true),
+ DebugStrictDwarf(false), Hotpatch(false),
PPCGenScalarMASSEntries(false), JMCInstrument(false),
EnableCFIFixup(false), MisExpect(false), XCOFFReadOnlyPointers(false),
VerifyArgABICompliance(true),
@@ -319,6 +320,9 @@ public:
/// to selectively generate basic block sections.
std::shared_ptr<MemoryBuffer> BBSectionsFuncListBuf;
+ /// Emit section containing call graph metadata.
+ unsigned EmitCallGraphSection : 1;
+
/// The flag enables call site info production. It is used only for debug
/// info, and it is restricted only to optimized code. This can be used for
/// something else, so that should be controlled in the frontend.
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 8ec8697..a4ed62b 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -1147,6 +1147,9 @@ def fadd_contract : PatFrag<(ops node:$a, node:$b), (fadd node:$a, node:$b),[{
return N->getFlags().hasAllowContract();
}]>;
+def fsub_contract : PatFrag<(ops node:$a, node:$b), (fsub node:$a, node:$b),[{
+ return N->getFlags().hasAllowContract();
+}]>;
def not : PatFrag<(ops node:$in), (xor node:$in, -1)>;
def vnot : PatFrag<(ops node:$in), (xor node:$in, immAllOnesV)>;
diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index 670a632..ede9797 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -199,7 +199,8 @@ public:
SUSE,
OpenEmbedded,
Intel,
- LastVendorType = Intel
+ Meta,
+ LastVendorType = Meta
};
enum OSType {
UnknownOS,
@@ -307,8 +308,8 @@ public:
Mlibc,
PAuthTest,
-
- LastEnvironmentType = PAuthTest
+ MTIA,
+ LastEnvironmentType = MTIA
};
enum ObjectFormatType {
UnknownObjectFormat,
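
A hedged sketch of what the two new enumerators enable; the triple string below is hypothetical, since the parser spellings ("meta", "mtia") live in Triple.cpp, which this diff does not show:

    #include "llvm/TargetParser/Triple.h"
    using namespace llvm;

    void classify() {
      Triple T("x86_64-meta-linux-mtia"); // hypothetical triple string
      bool IsMetaVendor = T.getVendor() == Triple::Meta;
      bool IsMTIAEnv = T.getEnvironment() == Triple::MTIA;
      (void)IsMetaVendor;
      (void)IsMTIAEnv;
    }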
diff --git a/llvm/include/llvm/TextAPI/SymbolSet.h b/llvm/include/llvm/TextAPI/SymbolSet.h
index cd30663..a04cb35 100644
--- a/llvm/include/llvm/TextAPI/SymbolSet.h
+++ b/llvm/include/llvm/TextAPI/SymbolSet.h
@@ -92,6 +92,7 @@ private:
public:
SymbolSet() = default;
+ ~SymbolSet();
LLVM_ABI Symbol *addGlobal(EncodeKind Kind, StringRef Name, SymbolFlags Flags,
const Target &Targ);
size_t size() const { return Symbols.size(); }
diff --git a/llvm/include/llvm/Transforms/HipStdPar/HipStdPar.h b/llvm/include/llvm/Transforms/HipStdPar/HipStdPar.h
index 20850ba..a9a370b 100644
--- a/llvm/include/llvm/Transforms/HipStdPar/HipStdPar.h
+++ b/llvm/include/llvm/Transforms/HipStdPar/HipStdPar.h
@@ -41,6 +41,13 @@ public:
static bool isRequired() { return true; }
};
+class HipStdParMathFixupPass : public PassInfoMixin<HipStdParMathFixupPass> {
+public:
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
+
+ static bool isRequired() { return true; }
+};
+
} // namespace llvm
#endif // LLVM_TRANSFORMS_HIPSTDPAR_HIPSTDPAR_H
diff --git a/llvm/include/llvm/Transforms/ObjCARC.h b/llvm/include/llvm/Transforms/ObjCARC.h
index c927513..c4b4c4f 100644
--- a/llvm/include/llvm/Transforms/ObjCARC.h
+++ b/llvm/include/llvm/Transforms/ObjCARC.h
@@ -35,10 +35,6 @@ struct ObjCARCContractPass : public PassInfoMixin<ObjCARCContractPass> {
LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
-struct ObjCARCAPElimPass : public PassInfoMixin<ObjCARCAPElimPass> {
- LLVM_ABI PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
-};
-
struct ObjCARCExpandPass : public PassInfoMixin<ObjCARCExpandPass> {
LLVM_ABI PreservedAnalyses run(Function &M, FunctionAnalysisManager &AM);
};
diff --git a/llvm/include/module.modulemap b/llvm/include/module.modulemap
index a4aae08..ac360b2 100644
--- a/llvm/include/module.modulemap
+++ b/llvm/include/module.modulemap
@@ -78,6 +78,7 @@ module LLVM_BinaryFormat {
textual header "llvm/BinaryFormat/DynamicTags.def"
textual header "llvm/BinaryFormat/MachO.def"
textual header "llvm/BinaryFormat/MinidumpConstants.def"
+ textual header "llvm/BinaryFormat/SFrameConstants.def"
textual header "llvm/BinaryFormat/Swift.def"
textual header "llvm/BinaryFormat/ELFRelocs/AArch64.def"
textual header "llvm/BinaryFormat/ELFRelocs/AMDGPU.def"
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index ec78386..759c553 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -929,12 +929,11 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
if (!AllConstantInt)
break;
- // TODO: Try to intersect two inrange attributes?
- if (!InRange) {
- InRange = GEP->getInRange();
- if (InRange)
- // Adjust inrange by offset until now.
- InRange = InRange->sextOrTrunc(BitWidth).subtract(Offset);
+ // Adjust inrange offset and intersect inrange attributes
+ if (auto GEPRange = GEP->getInRange()) {
+ auto AdjustedGEPRange = GEPRange->sextOrTrunc(BitWidth).subtract(Offset);
+ InRange =
+ InRange ? InRange->intersectWith(AdjustedGEPRange) : AdjustedGEPRange;
}
Ptr = cast<Constant>(GEP->getOperand(0));
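
A standalone sketch of the new combining logic, using only documented ConstantRange operations: the inner GEP's inrange is re-based by the accumulated offset and then intersected with any inrange collected so far:

    #include "llvm/ADT/APInt.h"
    #include "llvm/IR/ConstantRange.h"
    #include <optional>
    using namespace llvm;

    // Offset must already be of width BitWidth, as in the hunk above.
    ConstantRange combineInRange(std::optional<ConstantRange> InRange,
                                 const ConstantRange &GEPRange,
                                 const APInt &Offset, unsigned BitWidth) {
      // Re-express the range relative to the outer base pointer rather
      // than relative to this GEP.
      ConstantRange Adjusted = GEPRange.sextOrTrunc(BitWidth).subtract(Offset);
      // Intersect with what the outer GEPs contributed, if anything.
      return InRange ? InRange->intersectWith(Adjusted) : Adjusted;
    }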
diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp
index 2da6468..1959ab6 100644
--- a/llvm/lib/Analysis/DXILResource.cpp
+++ b/llvm/lib/Analysis/DXILResource.cpp
@@ -1079,15 +1079,16 @@ void DXILResourceBindingInfo::populate(Module &M, DXILResourceTypeMap &DRTM) {
// add new space
S = &BS->Spaces.emplace_back(B.Space);
- // the space is full - set flag to report overlapping binding later
- if (S->FreeRanges.empty()) {
+      // The space is full: either there are no free slots left, or the
+      // remaining slots are taken up by an unbounded array. Set the flag to
+      // report the overlapping binding later.
+ if (S->FreeRanges.empty() || S->FreeRanges.back().UpperBound < UINT32_MAX) {
OverlappingBinding = true;
continue;
}
// adjust the last free range lower bound, split it in two, or remove it
BindingRange &LastFreeRange = S->FreeRanges.back();
- assert(LastFreeRange.UpperBound == UINT32_MAX);
if (LastFreeRange.LowerBound == B.LowerBound) {
if (B.UpperBound < UINT32_MAX)
LastFreeRange.LowerBound = B.UpperBound + 1;
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp
index dd9a44b..f1473b2 100644
--- a/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -3383,6 +3383,10 @@ bool DependenceInfo::tryDelinearize(Instruction *Src, Instruction *Dst,
SrcSubscripts, DstSubscripts))
return false;
+ assert(isLoopInvariant(SrcBase, SrcLoop) &&
+ isLoopInvariant(DstBase, DstLoop) &&
+ "Expected SrcBase and DstBase to be loop invariant");
+
int Size = SrcSubscripts.size();
LLVM_DEBUG({
dbgs() << "\nSrcSubscripts: ";
@@ -3666,6 +3670,19 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
SCEVUnionPredicate(Assume, *SE));
}
+  // Even if the base pointers are the same, they may not be loop-invariant,
+  // which could lead to incorrect results when analyzing loop-carried
+  // dependencies. Src and Dst can be in different loops, so we need to check
+  // that the base pointer is invariant in both loops.
+ Loop *SrcLoop = LI->getLoopFor(Src->getParent());
+ Loop *DstLoop = LI->getLoopFor(Dst->getParent());
+ if (!isLoopInvariant(SrcBase, SrcLoop) ||
+ !isLoopInvariant(DstBase, DstLoop)) {
+ LLVM_DEBUG(dbgs() << "The base pointer is not loop invariant.\n");
+ return std::make_unique<Dependence>(Src, Dst,
+ SCEVUnionPredicate(Assume, *SE));
+ }
+
uint64_t EltSize = SrcLoc.Size.toRaw();
const SCEV *SrcEv = SE->getMinusSCEV(SrcSCEV, SrcBase);
const SCEV *DstEv = SE->getMinusSCEV(DstSCEV, DstBase);
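
An illustrative source-level example of what the bail-out guards against (a sketch, not code from this patch): the two accesses share a base, but the base changes every iteration, so same-base loop-carried reasoning would be unsound:

    void touch(int **Bases, int N) {
      for (int I = 0; I < N; ++I) {
        int *P = Bases[I]; // base pointer varies with the induction variable
        P[0] = P[1] + 1;   // Src and Dst share P, but P is not loop invariant
      }
    }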
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 82530e7..5907e21 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -5366,7 +5366,7 @@ static Value *simplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty,
Type *MidTy = CI->getType();
Type *DstTy = Ty;
if (Src->getType() == Ty) {
- auto FirstOp = static_cast<Instruction::CastOps>(CI->getOpcode());
+ auto FirstOp = CI->getOpcode();
auto SecondOp = static_cast<Instruction::CastOps>(CastOpc);
Type *SrcIntPtrTy =
SrcTy->isPtrOrPtrVectorTy() ? Q.DL.getIntPtrType(SrcTy) : nullptr;
diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
index 3aa9909..2b0f212 100644
--- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -983,33 +983,37 @@ MemDepResult MemoryDependenceResults::getNonLocalInfoForBlock(
static void
SortNonLocalDepInfoCache(MemoryDependenceResults::NonLocalDepInfo &Cache,
unsigned NumSortedEntries) {
- switch (Cache.size() - NumSortedEntries) {
- case 0:
- // done, no new entries.
- break;
- case 2: {
- // Two new entries, insert the last one into place.
- NonLocalDepEntry Val = Cache.back();
- Cache.pop_back();
- MemoryDependenceResults::NonLocalDepInfo::iterator Entry =
- std::upper_bound(Cache.begin(), Cache.end() - 1, Val);
- Cache.insert(Entry, Val);
- [[fallthrough]];
+
+  // With fewer than two entries there is nothing to sort.
+ if (Cache.size() < 2)
+ return;
+
+ unsigned s = Cache.size() - NumSortedEntries;
+
+ // If the cache is already sorted, don't sort it again.
+ if (s == 0)
+ return;
+
+ // If no entry is sorted, sort the whole cache.
+ if (NumSortedEntries == 0) {
+ llvm::sort(Cache);
+ return;
}
- case 1:
- // One new entry, Just insert the new value at the appropriate position.
- if (Cache.size() != 1) {
+
+  // If the number of unsorted entries is small relative to the cache size,
+  // inserting them one by one is faster than a full sort. Use Log2_32 as a
+  // cheap threshold for choosing between the two.
+ if (s < Log2_32(Cache.size())) {
+ while (s > 0) {
NonLocalDepEntry Val = Cache.back();
Cache.pop_back();
MemoryDependenceResults::NonLocalDepInfo::iterator Entry =
- llvm::upper_bound(Cache, Val);
+ std::upper_bound(Cache.begin(), Cache.end() - s + 1, Val);
Cache.insert(Entry, Val);
+ s--;
}
- break;
- default:
- // Added many values, do a full scale sort.
+ } else {
llvm::sort(Cache);
- break;
}
}
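
The small-s path is a plain binary-insertion of the unsorted tail; a self-contained sketch of the same loop on a std::vector, assuming NonLocalDepInfo behaves like a sorted vector:

    #include <algorithm>
    #include <vector>

    // Insert the S unsorted tail elements of V into its sorted prefix, one
    // at a time. After each pop_back, S - 1 unsorted elements remain at the
    // tail, so the sorted region ends at V.end() - (S - 1).
    static void insertUnsortedTail(std::vector<int> &V, unsigned S) {
      while (S > 0) {
        int Val = V.back();
        V.pop_back();
        auto It = std::upper_bound(V.begin(), V.end() - (S - 1), Val);
        V.insert(It, Val);
        --S;
      }
    }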
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
index e475be2..6e92766 100644
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -875,6 +875,34 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T,
TLI.setUnavailable(LibFunc_toascii);
}
+ if (T.isOSFreeBSD()) {
+ TLI.setAvailable(LibFunc_dunder_strtok_r);
+ TLI.setAvailable(LibFunc_memalign);
+ TLI.setAvailable(LibFunc_fputc_unlocked);
+ TLI.setAvailable(LibFunc_fputs_unlocked);
+ TLI.setAvailable(LibFunc_fread_unlocked);
+ TLI.setAvailable(LibFunc_fwrite_unlocked);
+ TLI.setAvailable(LibFunc_getc_unlocked);
+ TLI.setAvailable(LibFunc_getchar_unlocked);
+ TLI.setAvailable(LibFunc_putc_unlocked);
+ TLI.setAvailable(LibFunc_putchar_unlocked);
+
+ TLI.setUnavailable(LibFunc___kmpc_alloc_shared);
+ TLI.setUnavailable(LibFunc___kmpc_free_shared);
+ TLI.setUnavailable(LibFunc_dunder_strndup);
+ TLI.setUnavailable(LibFunc_memccpy_chk);
+ TLI.setUnavailable(LibFunc_strlen_chk);
+ TLI.setUnavailable(LibFunc_fmaximum_num);
+ TLI.setUnavailable(LibFunc_fmaximum_numf);
+ TLI.setUnavailable(LibFunc_fmaximum_numl);
+ TLI.setUnavailable(LibFunc_fminimum_num);
+ TLI.setUnavailable(LibFunc_fminimum_numf);
+ TLI.setUnavailable(LibFunc_fminimum_numl);
+ TLI.setUnavailable(LibFunc_roundeven);
+ TLI.setUnavailable(LibFunc_roundevenf);
+ TLI.setUnavailable(LibFunc_roundevenl);
+ }
+
// As currently implemented in clang, NVPTX code has no standard library to
// speak of. Headers provide a standard-ish library implementation, but many
// of the signatures are wrong -- for example, many libm functions are not
diff --git a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
index c871070..7025b83 100644
--- a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -525,6 +525,8 @@ AAMDNodes AAMDNodes::merge(const AAMDNodes &Other) const {
Result.TBAAStruct = nullptr;
Result.Scope = MDNode::getMostGenericAliasScope(Scope, Other.Scope);
Result.NoAlias = MDNode::intersect(NoAlias, Other.NoAlias);
+ Result.NoAliasAddrSpace = MDNode::getMostGenericNoaliasAddrspace(
+ NoAliasAddrSpace, Other.NoAliasAddrSpace);
return Result;
}
@@ -533,6 +535,8 @@ AAMDNodes AAMDNodes::concat(const AAMDNodes &Other) const {
Result.TBAA = Result.TBAAStruct = nullptr;
Result.Scope = MDNode::getMostGenericAliasScope(Scope, Other.Scope);
Result.NoAlias = MDNode::intersect(NoAlias, Other.NoAlias);
+ Result.NoAliasAddrSpace = MDNode::getMostGenericNoaliasAddrspace(
+ NoAliasAddrSpace, Other.NoAliasAddrSpace);
return Result;
}
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 1b3da59..150ddce 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -240,30 +240,6 @@ Intrinsic::ID llvm::getVectorIntrinsicIDForCall(const CallInst *CI,
return Intrinsic::not_intrinsic;
}
-struct InterleaveIntrinsic {
- Intrinsic::ID Interleave, Deinterleave;
-};
-
-static InterleaveIntrinsic InterleaveIntrinsics[] = {
- {Intrinsic::vector_interleave2, Intrinsic::vector_deinterleave2},
- {Intrinsic::vector_interleave3, Intrinsic::vector_deinterleave3},
- {Intrinsic::vector_interleave4, Intrinsic::vector_deinterleave4},
- {Intrinsic::vector_interleave5, Intrinsic::vector_deinterleave5},
- {Intrinsic::vector_interleave6, Intrinsic::vector_deinterleave6},
- {Intrinsic::vector_interleave7, Intrinsic::vector_deinterleave7},
- {Intrinsic::vector_interleave8, Intrinsic::vector_deinterleave8},
-};
-
-Intrinsic::ID llvm::getInterleaveIntrinsicID(unsigned Factor) {
- assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
- return InterleaveIntrinsics[Factor - 2].Interleave;
-}
-
-Intrinsic::ID llvm::getDeinterleaveIntrinsicID(unsigned Factor) {
- assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
- return InterleaveIntrinsics[Factor - 2].Deinterleave;
-}
-
unsigned llvm::getInterleaveIntrinsicFactor(Intrinsic::ID ID) {
switch (ID) {
case Intrinsic::vector_interleave2:
diff --git a/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp b/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
index 5d7c97a..6356d71 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
@@ -37,8 +37,8 @@ void AIXException::emitExceptionInfoTable(const MCSymbol *LSDA,
// unsigned long personality; /* Pointer to the personality routine */
// }
- auto *EHInfo =
- cast<MCSectionXCOFF>(Asm->getObjFileLowering().getCompactUnwindSection());
+ auto *EHInfo = static_cast<MCSectionXCOFF *>(
+ Asm->getObjFileLowering().getCompactUnwindSection());
if (Asm->TM.getFunctionSections()) {
// If option -ffunction-sections is on, append the function name to the
// name of EH Info Table csect so that each function has its own EH Info
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index f1d3e96..6166271 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -4221,10 +4221,11 @@ MCSymbol *AsmPrinter::GetCPISymbol(unsigned CPID) const {
SectionKind Kind = CPE.getSectionKind(&DL);
const Constant *C = CPE.Val.ConstVal;
Align Alignment = CPE.Alignment;
- if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>(
- getObjFileLowering().getSectionForConstant(DL, Kind, C,
- Alignment))) {
- if (MCSymbol *Sym = S->getCOMDATSymbol()) {
+ auto *S =
+ getObjFileLowering().getSectionForConstant(DL, Kind, C, Alignment);
+ if (S && TM.getTargetTriple().isOSBinFormatCOFF()) {
+ if (MCSymbol *Sym =
+ static_cast<const MCSectionCOFF *>(S)->getCOMDATSymbol()) {
if (Sym->isUndefined())
OutStreamer->emitSymbolAttribute(Sym, MCSA_Global);
return Sym;
diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 8abeb56..c5d6e40 100644
--- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -1051,10 +1051,10 @@ void CodeViewDebug::switchToDebugSectionForSymbol(const MCSymbol *GVSym) {
// comdat key. A section may be comdat because of -ffunction-sections or
// because it is comdat in the IR.
MCSectionCOFF *GVSec =
- GVSym ? dyn_cast<MCSectionCOFF>(&GVSym->getSection()) : nullptr;
+ GVSym ? static_cast<MCSectionCOFF *>(&GVSym->getSection()) : nullptr;
const MCSymbol *KeySym = GVSec ? GVSec->getCOMDATSymbol() : nullptr;
- MCSectionCOFF *DebugSec = cast<MCSectionCOFF>(
+ auto *DebugSec = static_cast<MCSectionCOFF *>(
CompilerInfoAsm->getObjFileLowering().getCOFFDebugSymbolsSection());
DebugSec = OS.getContext().getAssociativeCOFFSection(DebugSec, KeySym);
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 11b8576..7188833 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -972,10 +972,9 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP,
// the call graph which could lead to some target function. For tail
// calls, no return PC information is needed, unless tuning for GDB in
// DWARF4 mode in which case we fake a return PC for compatibility.
- const MCSymbol *PCAddr =
- (!IsTail || CU.useGNUAnalogForDwarf5Feature())
- ? const_cast<MCSymbol *>(getLabelAfterInsn(TopLevelCallMI))
- : nullptr;
+ const MCSymbol *PCAddr = (!IsTail || CU.useGNUAnalogForDwarf5Feature())
+ ? getLabelAfterInsn(TopLevelCallMI)
+ : nullptr;
// For tail calls, it's necessary to record the address of the branch
// instruction so that the debugger can show where the tail call occurred.
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
index 3b3e7a4..dcfd9aa 100644
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -2083,22 +2083,55 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
if (TBB == FBB) {
MBB->splice(Loc, TBB, TBB->begin(), TIB);
} else {
+ // Merge the debug locations, and hoist and kill the debug instructions from
+ // both branches. FIXME: We could probably try harder to preserve some debug
+ // instructions (but at least this isn't producing wrong locations).
+ MachineInstrBuilder MIRBuilder(*MBB->getParent(), Loc);
+ auto HoistAndKillDbgInstr = [MBB, Loc](MachineBasicBlock::iterator DI) {
+ assert(DI->isDebugInstr() && "Expected a debug instruction");
+ if (DI->isDebugRef()) {
+ const TargetInstrInfo *TII =
+ MBB->getParent()->getSubtarget().getInstrInfo();
+ const MCInstrDesc &DBGV = TII->get(TargetOpcode::DBG_VALUE);
+ DI = BuildMI(*MBB->getParent(), DI->getDebugLoc(), DBGV, false, 0,
+ DI->getDebugVariable(), DI->getDebugExpression());
+ MBB->insert(Loc, &*DI);
+ return;
+ }
+ // Deleting a DBG_PHI results in an undef at the referenced DBG_INSTR_REF.
+ if (DI->isDebugPHI()) {
+ DI->eraseFromParent();
+ return;
+ }
+ // Move DBG_LABELs without modifying them. Set DBG_VALUEs undef.
+ if (!DI->isDebugLabel())
+ DI->setDebugValueUndef();
+ DI->moveBefore(&*Loc);
+ };
+
// TIB and FIB point to the end of the regions to hoist/merge in TBB and
// FBB.
MachineBasicBlock::iterator FE = FIB;
MachineBasicBlock::iterator FI = FBB->begin();
for (MachineBasicBlock::iterator TI :
make_early_inc_range(make_range(TBB->begin(), TIB))) {
- // Move debug instructions and pseudo probes without modifying them.
- // FIXME: This is the wrong thing to do for debug locations, which
- // should at least be killed (and hoisted from BOTH blocks).
- if (TI->isDebugOrPseudoInstr()) {
- TI->moveBefore(&*Loc);
+      // Hoist and kill debug instructions from FBB. After this loop, FI
+      // points to the next non-debug instruction to hoist (checked in the
+      // assert after the TBB debug instruction handling code below).
+ while (FI != FE && FI->isDebugInstr())
+ HoistAndKillDbgInstr(FI++);
+
+ // Kill debug instructions before moving.
+ if (TI->isDebugInstr()) {
+ HoistAndKillDbgInstr(TI);
continue;
}
- // Get the next non-meta instruction in FBB.
- FI = skipDebugInstructionsForward(FI, FE, false);
+ // FI and TI now point to identical non-debug instructions.
+ assert(FI != FE && "Unexpected end of FBB range");
+ // Pseudo probes are excluded from the range when identifying foldable
+ // instructions, so we don't expect to see one now.
+ assert(!TI->isPseudoProbe() && "Unexpected pseudo probe in range");
// NOTE: The loop above checks CheckKillDead but we can't do that here as
// it modifies some kill markers after the check.
assert(TI->isIdenticalTo(*FI, MachineInstr::CheckDefs) &&
@@ -2111,6 +2144,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
++FI;
}
}
+
FBB->erase(FBB->begin(), FIB);
if (UpdateLiveIns)
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index c21058c..416c56d 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2095,6 +2095,10 @@ static bool isRemOfLoopIncrementWithLoopInvariant(
if (!L->isLoopInvariant(RemAmt))
return false;
+  // This only works if AddOffset is loop invariant.
+ if (AddOffset && !L->isLoopInvariant(AddOffset))
+ return false;
+
// Is the PHI a loop increment?
auto LoopIncrInfo = getIVIncrement(PN, LI);
if (!LoopIncrInfo)
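
In source terms, the new check covers patterns like the following sketch, where rewriting the remainder is only sound if both the offset and the divisor stay fixed inside the loop:

    void fill(int *A, int N, int Off, int M) {
      for (int I = 0; I < N; ++I)
        A[(I + Off) % M] += 1; // Off (the AddOffset) must be loop invariant
    }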
diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp
index 9512f79..810dc29 100644
--- a/llvm/lib/CodeGen/CommandFlags.cpp
+++ b/llvm/lib/CodeGen/CommandFlags.cpp
@@ -101,6 +101,7 @@ CGOPT(EABI, EABIVersion)
CGOPT(DebuggerKind, DebuggerTuningOpt)
CGOPT(bool, EnableStackSizeSection)
CGOPT(bool, EnableAddrsig)
+CGOPT(bool, EnableCallGraphSection)
CGOPT(bool, EmitCallSiteInfo)
CGOPT(bool, EnableMachineFunctionSplitter)
CGOPT(bool, EnableStaticDataPartitioning)
@@ -461,6 +462,11 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() {
cl::init(false));
CGBINDOPT(EnableAddrsig);
+ static cl::opt<bool> EnableCallGraphSection(
+ "call-graph-section", cl::desc("Emit a call graph section"),
+ cl::init(false));
+ CGBINDOPT(EnableCallGraphSection);
+
static cl::opt<bool> EmitCallSiteInfo(
"emit-call-site-info",
cl::desc(
@@ -595,6 +601,7 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) {
Options.EnableMachineFunctionSplitter = getEnableMachineFunctionSplitter();
Options.EnableStaticDataPartitioning = getEnableStaticDataPartitioning();
Options.EmitAddrsig = getEnableAddrsig();
+ Options.EmitCallGraphSection = getEnableCallGraphSection();
Options.EmitCallSiteInfo = getEmitCallSiteInfo();
Options.EnableDebugEntryValues = getEnableDebugEntryValues();
Options.ForceDwarfFrameSection = getForceDwarfFrameSection();
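
A minimal sketch of the resulting plumbing, assuming the host tool has constructed codegen::RegisterCodeGenFlags so the cl::opt exists; passing -call-graph-section on the command line and setting the bit directly are equivalent:

    #include "llvm/CodeGen/CommandFlags.h"
    #include "llvm/Target/TargetOptions.h"
    #include "llvm/TargetParser/Triple.h"
    using namespace llvm;

    TargetOptions makeOptions(const Triple &TT) {
      // Picks up -call-graph-section, among the other codegen flags.
      TargetOptions Opts = codegen::InitTargetOptionsFromCodeGenFlags(TT);
      // Or force it programmatically, independent of the cl::opt:
      Opts.EmitCallGraphSection = 1; // bit-field introduced by this patch
      return Opts;
    }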
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 8855740f..9b2851e 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -2186,19 +2186,16 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
llvm_unreachable("Deinterleave node should already have ReplacementNode");
break;
case ComplexDeinterleavingOperation::Splat: {
- auto *NewTy = VectorType::getDoubleElementsVectorType(
- cast<VectorType>(Node->Real->getType()));
auto *R = dyn_cast<Instruction>(Node->Real);
auto *I = dyn_cast<Instruction>(Node->Imag);
if (R && I) {
// Splats that are not constant are interleaved where they are located
Instruction *InsertPoint = (I->comesBefore(R) ? R : I)->getNextNode();
IRBuilder<> IRB(InsertPoint);
- ReplacementNode = IRB.CreateIntrinsic(Intrinsic::vector_interleave2,
- NewTy, {Node->Real, Node->Imag});
+ ReplacementNode = IRB.CreateVectorInterleave({Node->Real, Node->Imag});
} else {
- ReplacementNode = Builder.CreateIntrinsic(
- Intrinsic::vector_interleave2, NewTy, {Node->Real, Node->Imag});
+ ReplacementNode =
+ Builder.CreateVectorInterleave({Node->Real, Node->Imag});
}
break;
}
@@ -2226,10 +2223,7 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
auto *MaskImag = cast<Instruction>(Node->Imag)->getOperand(0);
auto *A = replaceNode(Builder, Node->Operands[0]);
auto *B = replaceNode(Builder, Node->Operands[1]);
- auto *NewMaskTy = VectorType::getDoubleElementsVectorType(
- cast<VectorType>(MaskReal->getType()));
- auto *NewMask = Builder.CreateIntrinsic(Intrinsic::vector_interleave2,
- NewMaskTy, {MaskReal, MaskImag});
+ auto *NewMask = Builder.CreateVectorInterleave({MaskReal, MaskImag});
ReplacementNode = Builder.CreateSelect(NewMask, A, B);
break;
}
@@ -2260,8 +2254,8 @@ void ComplexDeinterleavingGraph::processReductionSingle(
}
if (!NewInit)
- NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy,
- {Init, Constant::getNullValue(VTy)});
+ NewInit =
+ Builder.CreateVectorInterleave({Init, Constant::getNullValue(VTy)});
NewPHI->addIncoming(NewInit, Incoming);
NewPHI->addIncoming(OperationReplacement, BackEdge);
@@ -2281,16 +2275,12 @@ void ComplexDeinterleavingGraph::processReductionOperation(
auto *OldPHIImag = ReductionInfo[Imag].first;
auto *NewPHI = OldToNewPHI[OldPHIReal];
- auto *VTy = cast<VectorType>(Real->getType());
- auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy);
-
// We have to interleave initial origin values coming from IncomingBlock
Value *InitReal = OldPHIReal->getIncomingValueForBlock(Incoming);
Value *InitImag = OldPHIImag->getIncomingValueForBlock(Incoming);
IRBuilder<> Builder(Incoming->getTerminator());
- auto *NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy,
- {InitReal, InitImag});
+ auto *NewInit = Builder.CreateVectorInterleave({InitReal, InitImag});
NewPHI->addIncoming(NewInit, Incoming);
NewPHI->addIncoming(OperationReplacement, BackEdge);
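
All of these call sites now go through a single IRBuilder helper; a hedged sketch of the idiom, assuming CreateVectorInterleave derives the double-width result type from its operands, which is what the deleted getDoubleElementsVectorType code computed by hand:

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    Value *interleavePair(IRBuilderBase &Builder, Value *Real, Value *Imag) {
      // Both operands must have the same vector type; the result is the
      // corresponding vector type with twice as many elements.
      return Builder.CreateVectorInterleave({Real, Imag});
    }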
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index e8f513a..e84ba91 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5949,8 +5949,7 @@ bool CombinerHelper::canCombineFMadOrFMA(MachineInstr &MI,
const TargetOptions &Options = MF->getTarget().Options;
LLT DstType = MRI.getType(MI.getOperand(0).getReg());
- if (CanReassociate &&
- !(Options.UnsafeFPMath || MI.getFlag(MachineInstr::MIFlag::FmReassoc)))
+ if (CanReassociate && !MI.getFlag(MachineInstr::MIFlag::FmReassoc))
return false;
// Floating-point multiply-add with intermediate rounding.
@@ -5962,8 +5961,7 @@ bool CombinerHelper::canCombineFMadOrFMA(MachineInstr &MI,
if (!HasFMAD && !HasFMA)
return false;
- AllowFusionGlobally = Options.AllowFPOpFusion == FPOpFusion::Fast ||
- Options.UnsafeFPMath || HasFMAD;
+ AllowFusionGlobally = Options.AllowFPOpFusion == FPOpFusion::Fast || HasFMAD;
// If the addition is not contractable, do not combine.
if (!AllowFusionGlobally && !MI.getFlag(MachineInstr::MIFlag::FmContract))
return false;
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index ed7b07f..538a763 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -8004,7 +8004,7 @@ LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
return UnableToLegalize;
- if (MIRBuilder.getMF().getTarget().Options.UnsafeFPMath) {
+ if (MI.getFlag(MachineInstr::FmAfn)) {
unsigned Flags = MI.getFlags();
auto Src32 = MIRBuilder.buildFPTrunc(S32, Src, Flags);
MIRBuilder.buildFPTrunc(Dst, Src32, Flags);
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index c2839d4..5e50898 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -634,6 +634,9 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
<< " and factor = " << Factor << "\n");
} else {
assert(II);
+ if (II->getIntrinsicID() != Intrinsic::masked_load &&
+ II->getIntrinsicID() != Intrinsic::vp_load)
+ return false;
// Check mask operand. Handle both all-true/false and interleaved mask.
Mask = getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI));
@@ -673,6 +676,9 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
Value *Mask = nullptr;
if (II) {
+ if (II->getIntrinsicID() != Intrinsic::masked_store &&
+ II->getIntrinsicID() != Intrinsic::vp_store)
+ return false;
// Check mask operand. Handle both all-true/false and interleaved mask.
Mask = getMask(getMaskOperand(II), Factor,
cast<VectorType>(InterleaveValues[0]->getType()));
diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/llvm/lib/CodeGen/MIRParser/MILexer.cpp
index 7153902..193df1f 100644
--- a/llvm/lib/CodeGen/MIRParser/MILexer.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MILexer.cpp
@@ -616,6 +616,7 @@ static MIToken::TokenKind getMetadataKeywordKind(StringRef Identifier) {
.Case("!range", MIToken::md_range)
.Case("!DIExpression", MIToken::md_diexpr)
.Case("!DILocation", MIToken::md_dilocation)
+ .Case("!noalias.addrspace", MIToken::md_noalias_addrspace)
.Default(MIToken::Error);
}
diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.h b/llvm/lib/CodeGen/MIRParser/MILexer.h
index d7cd067..54142ac 100644
--- a/llvm/lib/CodeGen/MIRParser/MILexer.h
+++ b/llvm/lib/CodeGen/MIRParser/MILexer.h
@@ -151,6 +151,7 @@ struct MIToken {
md_tbaa,
md_alias_scope,
md_noalias,
+ md_noalias_addrspace,
md_range,
md_diexpr,
md_dilocation,
diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index 3a364d5..807d59c 100644
--- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -3482,6 +3482,11 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
if (parseMDNode(AAInfo.NoAlias))
return true;
break;
+ case MIToken::md_noalias_addrspace:
+ lex();
+ if (parseMDNode(AAInfo.NoAliasAddrSpace))
+ return true;
+ break;
case MIToken::md_range:
lex();
if (parseMDNode(Range))
@@ -3490,7 +3495,7 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
// TODO: Report an error on duplicate metadata nodes.
default:
return error("expected 'align' or '!tbaa' or '!alias.scope' or "
- "'!noalias' or '!range'");
+ "'!noalias' or '!range' or '!noalias.addrspace'");
}
}
if (expectAndConsume(MIToken::rparen))
diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
index 1e9fcf3..3e99e57 100644
--- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -504,13 +504,21 @@ bool MIRParserImpl::initializeCallSiteInfo(
return error(Error, ArgRegPair.Reg.SourceRange);
CSInfo.ArgRegPairs.emplace_back(Reg, ArgRegPair.ArgNo);
}
+    IntegerType *Int64Ty = Type::getInt64Ty(Context);
+    for (auto CalleeTypeId : YamlCSInfo.CalleeTypeIds)
+      CSInfo.CalleeTypeIds.push_back(
+          ConstantInt::get(Int64Ty, CalleeTypeId, /*isSigned=*/false));
- if (TM.Options.EmitCallSiteInfo)
+ if (TM.Options.EmitCallSiteInfo || TM.Options.EmitCallGraphSection)
MF.addCallSiteInfo(&*CallI, std::move(CSInfo));
}
- if (YamlMF.CallSitesInfo.size() && !TM.Options.EmitCallSiteInfo)
- return error(Twine("Call site info provided but not used"));
+ if (!YamlMF.CallSitesInfo.empty() &&
+ !(TM.Options.EmitCallSiteInfo || TM.Options.EmitCallGraphSection))
+ return error("call site info provided but not used");
return false;
}
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index bc4e299..ad7835a 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -525,24 +525,30 @@ static void convertCallSiteObjects(yaml::MachineFunction &YMF,
const MachineFunction &MF,
ModuleSlotTracker &MST) {
const auto *TRI = MF.getSubtarget().getRegisterInfo();
- for (auto CSInfo : MF.getCallSitesInfo()) {
+ for (auto [MI, CallSiteInfo] : MF.getCallSitesInfo()) {
yaml::CallSiteInfo YmlCS;
yaml::MachineInstrLoc CallLocation;
// Prepare instruction position.
- MachineBasicBlock::const_instr_iterator CallI = CSInfo.first->getIterator();
+ MachineBasicBlock::const_instr_iterator CallI = MI->getIterator();
CallLocation.BlockNum = CallI->getParent()->getNumber();
// Get call instruction offset from the beginning of block.
CallLocation.Offset =
std::distance(CallI->getParent()->instr_begin(), CallI);
YmlCS.CallLocation = CallLocation;
+
+ auto [ArgRegPairs, CalleeTypeIds] = CallSiteInfo;
    // Construct call arguments and their forwarding register info.
- for (auto ArgReg : CSInfo.second.ArgRegPairs) {
+ for (auto ArgReg : ArgRegPairs) {
yaml::CallSiteInfo::ArgRegPair YmlArgReg;
YmlArgReg.ArgNo = ArgReg.ArgNo;
printRegMIR(ArgReg.Reg, YmlArgReg.Reg, TRI);
YmlCS.ArgForwardingRegs.emplace_back(YmlArgReg);
}
+    // Record the callee type ids.
+    for (auto *CalleeTypeId : CalleeTypeIds)
+      YmlCS.CalleeTypeIds.push_back(CalleeTypeId->getZExtValue());
YMF.CallSitesInfo.push_back(std::move(YmlCS));
}
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index 429a17a..60d42e0 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -211,8 +211,7 @@ void MachineFunction::init() {
ConstantPool = new (Allocator) MachineConstantPool(getDataLayout());
Alignment = STI->getTargetLowering()->getMinFunctionAlignment();
- // FIXME: Use Function::hasOptSize().
- if (!F.getAlign() && !F.hasFnAttribute(Attribute::OptimizeForSize))
+ if (!F.getAlign() && !F.hasOptSize())
Alignment = std::max(Alignment,
STI->getTargetLowering()->getPrefFunctionAlignment());
@@ -920,7 +919,7 @@ MachineFunction::getCallSiteInfo(const MachineInstr *MI) {
assert(MI->isCandidateForAdditionalCallInfo() &&
"Call site info refers only to call (MI) candidates");
- if (!Target.Options.EmitCallSiteInfo)
+ if (!Target.Options.EmitCallSiteInfo && !Target.Options.EmitCallGraphSection)
return CallSitesInfo.end();
return CallSitesInfo.find(MI);
}
diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp
index 0d25169..c612f8de 100644
--- a/llvm/lib/CodeGen/MachineOperand.cpp
+++ b/llvm/lib/CodeGen/MachineOperand.cpp
@@ -1273,6 +1273,10 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
OS << ", !noalias ";
AAInfo.NoAlias->printAsOperand(OS, MST);
}
+ if (AAInfo.NoAliasAddrSpace) {
+ OS << ", !noalias.addrspace ";
+ AAInfo.NoAliasAddrSpace->printAsOperand(OS, MST);
+ }
if (getRanges()) {
OS << ", !range ";
getRanges()->printAsOperand(OS, MST);
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d3df434..251682a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -35,6 +35,7 @@
#include "llvm/CodeGen/ByteProvider.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/SDPatternMatch.h"
@@ -22778,8 +22779,10 @@ SDValue DAGCombiner::visitLIFETIME_END(SDNode *N) {
const BaseIndexOffset StoreBase = BaseIndexOffset::match(ST, DAG);
// If we store purely within object bounds just before its lifetime ends,
// we can remove the store.
- if (LifetimeEndBase.contains(DAG, LifetimeEnd->getSize() * 8, StoreBase,
- StoreSize.getFixedValue() * 8)) {
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ if (LifetimeEndBase.contains(
+ DAG, MFI.getObjectSize(LifetimeEnd->getFrameIndex()) * 8,
+ StoreBase, StoreSize.getFixedValue() * 8)) {
LLVM_DEBUG(dbgs() << "\nRemoving store:"; StoreBase.dump();
dbgs() << "\nwithin LIFETIME_END of : ";
LifetimeEndBase.dump(); dbgs() << "\n");
@@ -29415,7 +29418,7 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
MachineMemOperand *MMO;
};
- auto getCharacteristics = [](SDNode *N) -> MemUseCharacteristics {
+ auto getCharacteristics = [this](SDNode *N) -> MemUseCharacteristics {
if (const auto *LSN = dyn_cast<LSBaseSDNode>(N)) {
int64_t Offset = 0;
if (auto *C = dyn_cast<ConstantSDNode>(LSN->getOffset()))
@@ -29428,13 +29431,15 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
LSN->getBasePtr(), Offset /*base offset*/,
LocationSize::precise(Size), LSN->getMemOperand()};
}
- if (const auto *LN = cast<LifetimeSDNode>(N))
+ if (const auto *LN = cast<LifetimeSDNode>(N)) {
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
return {false /*isVolatile*/,
/*isAtomic*/ false,
LN->getOperand(1),
0,
- LocationSize::precise(LN->getSize()),
+ LocationSize::precise(MFI.getObjectSize(LN->getFrameIndex())),
(MachineMemOperand *)nullptr};
+ }
// Default.
return {false /*isvolatile*/,
/*isAtomic*/ false,
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 74172b2..ba0ab23 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3853,7 +3853,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
break;
case ISD::FP_TO_FP16:
LLVM_DEBUG(dbgs() << "Legalizing FP_TO_FP16\n");
- if (!TLI.useSoftFloat() && TM.Options.UnsafeFPMath) {
+ if (Node->getFlags().hasApproximateFuncs() && !TLI.useSoftFloat()) {
SDValue Op = Node->getOperand(0);
MVT SVT = Op.getSimpleValueType();
if ((SVT == MVT::f64 || SVT == MVT::f80) &&
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index e5704c0..583a85a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -22,6 +22,7 @@
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
@@ -357,6 +358,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::PATCHPOINT:
Res = PromoteIntRes_PATCHPOINT(N);
break;
+ case ISD::READ_REGISTER:
+ Res = PromoteIntRes_READ_REGISTER(N);
+ break;
}
// If the result is null then the sub-method took care of registering it.
@@ -2076,6 +2080,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::PATCHPOINT:
Res = PromoteIntOp_PATCHPOINT(N, OpNo);
break;
+ case ISD::WRITE_REGISTER:
+ Res = PromoteIntOp_WRITE_REGISTER(N, OpNo);
+ break;
case ISD::EXPERIMENTAL_VP_STRIDED_LOAD:
case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
Res = PromoteIntOp_VP_STRIDED(N, OpNo);
@@ -2853,6 +2860,15 @@ SDValue DAGTypeLegalizer::PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo) {
return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
}
+SDValue DAGTypeLegalizer::PromoteIntOp_WRITE_REGISTER(SDNode *N,
+ unsigned OpNo) {
+ const Function &Fn = DAG.getMachineFunction().getFunction();
+ Fn.getContext().diagnose(DiagnosticInfoLegalizationFailure(
+ "cannot use llvm.write_register with illegal type", Fn,
+ N->getDebugLoc()));
+ return N->getOperand(0);
+}
+
SDValue DAGTypeLegalizer::PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo) {
assert((N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_LOAD && OpNo == 3) ||
(N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_STORE && OpNo == 4));
@@ -3127,6 +3143,10 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::VSCALE:
ExpandIntRes_VSCALE(N, Lo, Hi);
break;
+
+ case ISD::READ_REGISTER:
+ ExpandIntRes_READ_REGISTER(N, Lo, Hi);
+ break;
}
// If Lo/Hi is null, the sub-method took care of registering results etc.
@@ -5471,6 +5491,18 @@ void DAGTypeLegalizer::ExpandIntRes_VSCALE(SDNode *N, SDValue &Lo,
SplitInteger(Res, Lo, Hi);
}
+void DAGTypeLegalizer::ExpandIntRes_READ_REGISTER(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ const Function &Fn = DAG.getMachineFunction().getFunction();
+ Fn.getContext().diagnose(DiagnosticInfoLegalizationFailure(
+ "cannot use llvm.read_register with illegal type", Fn, N->getDebugLoc()));
+ ReplaceValueWith(SDValue(N, 1), N->getOperand(0));
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+ Lo = DAG.getPOISON(LoVT);
+ Hi = DAG.getPOISON(HiVT);
+}
+
//===----------------------------------------------------------------------===//
// Integer Operand Expansion
//===----------------------------------------------------------------------===//
@@ -5537,6 +5569,9 @@ bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
Res = ExpandIntOp_VP_STRIDED(N, OpNo);
break;
+ case ISD::WRITE_REGISTER:
+ Res = ExpandIntOp_WRITE_REGISTER(N, OpNo);
+ break;
}
// If the result is null, the sub-method took care of registering results etc.
@@ -5935,6 +5970,15 @@ SDValue DAGTypeLegalizer::ExpandIntOp_VP_STRIDED(SDNode *N, unsigned OpNo) {
return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
}
+SDValue DAGTypeLegalizer::ExpandIntOp_WRITE_REGISTER(SDNode *N, unsigned OpNo) {
+ const Function &Fn = DAG.getMachineFunction().getFunction();
+ Fn.getContext().diagnose(DiagnosticInfoLegalizationFailure(
+ "cannot use llvm.write_register with illegal type", Fn,
+ N->getDebugLoc()));
+
+ return N->getOperand(0);
+}
+
SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SPLICE(SDNode *N) {
SDLoc dl(N);
@@ -6332,6 +6376,16 @@ SDValue DAGTypeLegalizer::PromoteIntRes_PATCHPOINT(SDNode *N) {
return Res.getValue(0);
}
+SDValue DAGTypeLegalizer::PromoteIntRes_READ_REGISTER(SDNode *N) {
+ const Function &Fn = DAG.getMachineFunction().getFunction();
+ Fn.getContext().diagnose(DiagnosticInfoLegalizationFailure(
+ "cannot use llvm.read_register with illegal type", Fn, N->getDebugLoc()));
+
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ ReplaceValueWith(SDValue(N, 1), N->getOperand(0));
+ return DAG.getPOISON(NVT);
+}
+
SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N) {
SDLoc dl(N);
SDValue V0 = GetPromotedInteger(N->getOperand(0));
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 9b53724..2e13b18 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -378,6 +378,7 @@ private:
SDValue PromoteIntRes_VPFunnelShift(SDNode *N);
SDValue PromoteIntRes_IS_FPCLASS(SDNode *N);
SDValue PromoteIntRes_PATCHPOINT(SDNode *N);
+ SDValue PromoteIntRes_READ_REGISTER(SDNode *N);
SDValue PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(SDNode *N);
SDValue PromoteIntRes_GET_ACTIVE_LANE_MASK(SDNode *N);
SDValue PromoteIntRes_PARTIAL_REDUCE_MLA(SDNode *N);
@@ -428,6 +429,7 @@ private:
SDValue PromoteIntOp_SET_ROUNDING(SDNode *N);
SDValue PromoteIntOp_STACKMAP(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_WRITE_REGISTER(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_VP_SPLICE(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_VECTOR_HISTOGRAM(SDNode *N, unsigned OpNo);
@@ -511,6 +513,7 @@ private:
void ExpandIntRes_FunnelShift (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_VSCALE (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_READ_REGISTER(SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandShiftByConstant(SDNode *N, const APInt &Amt,
SDValue &Lo, SDValue &Hi);
@@ -534,6 +537,7 @@ private:
SDValue ExpandIntOp_STACKMAP(SDNode *N, unsigned OpNo);
SDValue ExpandIntOp_PATCHPOINT(SDNode *N, unsigned OpNo);
SDValue ExpandIntOp_VP_STRIDED(SDNode *N, unsigned OpNo);
+ SDValue ExpandIntOp_WRITE_REGISTER(SDNode *N, unsigned OpNo);
void IntegerExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
ISD::CondCode &CCCode, const SDLoc &dl);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 773ff48..5c586f7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -784,10 +784,6 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
case ISD::TargetFrameIndex:
ID.AddInteger(cast<FrameIndexSDNode>(N)->getIndex());
break;
- case ISD::LIFETIME_START:
- case ISD::LIFETIME_END:
- ID.AddInteger(cast<LifetimeSDNode>(N)->getSize());
- break;
case ISD::PSEUDO_PROBE:
ID.AddInteger(cast<PseudoProbeSDNode>(N)->getGuid());
ID.AddInteger(cast<PseudoProbeSDNode>(N)->getIndex());
@@ -9360,8 +9356,7 @@ SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl,
}
SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl,
- SDValue Chain, int FrameIndex,
- int64_t Size) {
+ SDValue Chain, int FrameIndex) {
const unsigned Opcode = IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END;
const auto VTs = getVTList(MVT::Other);
SDValue Ops[2] = {
@@ -9373,13 +9368,12 @@ SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl,
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTs, Ops);
ID.AddInteger(FrameIndex);
- ID.AddInteger(Size);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
return SDValue(E, 0);
- LifetimeSDNode *N = newSDNode<LifetimeSDNode>(Opcode, dl.getIROrder(),
- dl.getDebugLoc(), VTs, Size);
+ LifetimeSDNode *N =
+ newSDNode<LifetimeSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTs);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
InsertNode(N);
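
Caller-side, the size argument simply disappears; a sketch of the new shape (the helper below is illustrative, not part of the patch):

    #include "llvm/CodeGen/MachineFrameInfo.h"
    #include "llvm/CodeGen/SelectionDAG.h"
    using namespace llvm;

    SDValue emitLifetimeStart(SelectionDAG &DAG, const SDLoc &DL, int FI) {
      // The node now carries only the frame index; the old Size operand is
      // gone.
      SDValue Start =
          DAG.getLifetimeNode(/*IsStart=*/true, DL, DAG.getRoot(), FI);
      // When a user needs the extent, it is recovered from the frame info,
      // as the DAGCombiner hunk above now does.
      uint64_t SizeInBits =
          DAG.getMachineFunction().getFrameInfo().getObjectSize(FI) * 8;
      (void)SizeInBits;
      return Start;
    }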
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 1636465..306e068 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3923,11 +3923,15 @@ void SelectionDAGBuilder::visitFPTrunc(const User &I) {
// FPTrunc is never a no-op cast, no need to check
SDValue N = getValue(I.getOperand(0));
SDLoc dl = getCurSDLoc();
+ SDNodeFlags Flags;
+ if (auto *TruncInst = dyn_cast<FPMathOperator>(&I))
+ Flags.copyFMF(*TruncInst);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
setValue(&I, DAG.getNode(ISD::FP_ROUND, dl, DestVT, N,
DAG.getTargetConstant(
- 0, dl, TLI.getPointerTy(DAG.getDataLayout()))));
+ 0, dl, TLI.getPointerTy(DAG.getDataLayout())),
+ Flags));
}
void SelectionDAGBuilder::visitFPExt(const User &I) {
@@ -7594,8 +7598,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
if (TM.getOptLevel() == CodeGenOptLevel::None)
return;
- const int64_t ObjectSize =
- cast<ConstantInt>(I.getArgOperand(0))->getSExtValue();
const AllocaInst *LifetimeObject = cast<AllocaInst>(I.getArgOperand(1));
// First check that the Alloca is static, otherwise it won't have a
@@ -7605,7 +7607,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
return;
const int FrameIndex = SI->second;
- Res = DAG.getLifetimeNode(IsStart, sdl, getRoot(), FrameIndex, ObjectSize);
+ Res = DAG.getLifetimeNode(IsStart, sdl, getRoot(), FrameIndex);
DAG.setRoot(Res);
return;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 9474587..900da76 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -946,8 +946,6 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
<< " -> "
<< ASC->getDestAddressSpace()
<< ']';
- } else if (const LifetimeSDNode *LN = dyn_cast<LifetimeSDNode>(this)) {
- OS << "<0 to " << LN->getSize() << ">";
} else if (const auto *AA = dyn_cast<AssertAlignSDNode>(this)) {
OS << '<' << AA->getAlign().value() << '>';
}
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index d4a3455..68b8a00 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -806,7 +806,17 @@ void TargetLoweringBase::initActions() {
ISD::SDIVFIX, ISD::SDIVFIXSAT,
ISD::UDIVFIX, ISD::UDIVFIXSAT,
ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT,
- ISD::IS_FPCLASS},
+ ISD::IS_FPCLASS, ISD::FCBRT,
+ ISD::FLOG, ISD::FLOG2,
+ ISD::FLOG10, ISD::FEXP,
+ ISD::FEXP2, ISD::FEXP10,
+ ISD::FFLOOR, ISD::FNEARBYINT,
+ ISD::FCEIL, ISD::FRINT,
+ ISD::FTRUNC, ISD::FROUNDEVEN,
+ ISD::FTAN, ISD::FACOS,
+ ISD::FASIN, ISD::FATAN,
+ ISD::FCOSH, ISD::FSINH,
+ ISD::FTANH, ISD::FATAN2},
VT, Expand);
// Overflow operations default to expand
@@ -852,13 +862,12 @@ void TargetLoweringBase::initActions() {
// These operations default to expand for vector types.
if (VT.isVector())
- setOperationAction(
- {ISD::FCOPYSIGN, ISD::SIGN_EXTEND_INREG, ISD::ANY_EXTEND_VECTOR_INREG,
- ISD::SIGN_EXTEND_VECTOR_INREG, ISD::ZERO_EXTEND_VECTOR_INREG,
- ISD::SPLAT_VECTOR, ISD::LRINT, ISD::LLRINT, ISD::LROUND,
- ISD::LLROUND, ISD::FTAN, ISD::FACOS, ISD::FASIN, ISD::FATAN,
- ISD::FCOSH, ISD::FSINH, ISD::FTANH, ISD::FATAN2},
- VT, Expand);
+ setOperationAction({ISD::FCOPYSIGN, ISD::SIGN_EXTEND_INREG,
+ ISD::ANY_EXTEND_VECTOR_INREG,
+ ISD::SIGN_EXTEND_VECTOR_INREG,
+ ISD::ZERO_EXTEND_VECTOR_INREG, ISD::SPLAT_VECTOR,
+ ISD::LRINT, ISD::LLRINT, ISD::LROUND, ISD::LLROUND},
+ VT, Expand);
// Constrained floating-point operations default to expand.
#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
@@ -914,15 +923,6 @@ void TargetLoweringBase::initActions() {
{MVT::bf16, MVT::f16, MVT::f32, MVT::f64, MVT::f80, MVT::f128},
Expand);
- // These library functions default to expand.
- setOperationAction({ISD::FCBRT, ISD::FLOG, ISD::FLOG2, ISD::FLOG10,
- ISD::FEXP, ISD::FEXP2, ISD::FEXP10, ISD::FFLOOR,
- ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
- ISD::FROUNDEVEN, ISD::FTAN, ISD::FACOS, ISD::FASIN,
- ISD::FATAN, ISD::FCOSH, ISD::FSINH, ISD::FTANH,
- ISD::FATAN2},
- {MVT::f32, MVT::f64, MVT::f128}, Expand);
-
// Insert custom handling default for llvm.canonicalize.*.
setOperationAction(ISD::FCANONICALIZE,
{MVT::f16, MVT::f32, MVT::f64, MVT::f128}, Expand);
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index a40ceaa..725e951 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -995,7 +995,7 @@ MCSection *TargetLoweringObjectFileELF::getSectionForLSDA(
if (!LSDASection || (!F.hasComdat() && !TM.getFunctionSections()))
return LSDASection;
- const auto *LSDA = cast<MCSectionELF>(LSDASection);
+ const auto *LSDA = static_cast<const MCSectionELF *>(LSDASection);
unsigned Flags = LSDA->getFlags();
const MCSymbolELF *LinkedToSym = nullptr;
StringRef Group;
@@ -1734,7 +1734,8 @@ MCSection *TargetLoweringObjectFileCOFF::getExplicitSectionGlobal(
Name == getInstrProfSectionName(IPSK_covdata, Triple::COFF,
/*AddSegmentInfo=*/false) ||
Name == getInstrProfSectionName(IPSK_covname, Triple::COFF,
- /*AddSegmentInfo=*/false))
+ /*AddSegmentInfo=*/false) ||
+ Name == ".llvmbc" || Name == ".llvmcmd")
Kind = SectionKind::getMetadata();
int Selection = 0;
unsigned Characteristics = getCOFFSectionFlags(Kind, TM);
@@ -2054,14 +2055,14 @@ MCSection *TargetLoweringObjectFileCOFF::getStaticCtorSection(
unsigned Priority, const MCSymbol *KeySym) const {
return getCOFFStaticStructorSection(
getContext(), getContext().getTargetTriple(), true, Priority, KeySym,
- cast<MCSectionCOFF>(StaticCtorSection));
+ static_cast<MCSectionCOFF *>(StaticCtorSection));
}
MCSection *TargetLoweringObjectFileCOFF::getStaticDtorSection(
unsigned Priority, const MCSymbol *KeySym) const {
return getCOFFStaticStructorSection(
getContext(), getContext().getTargetTriple(), false, Priority, KeySym,
- cast<MCSectionCOFF>(StaticDtorSection));
+ static_cast<MCSectionCOFF *>(StaticDtorSection));
}
const MCExpr *TargetLoweringObjectFileCOFF::lowerRelativeReference(
@@ -2388,23 +2389,25 @@ TargetLoweringObjectFileXCOFF::getTargetSymbol(const GlobalValue *GV,
// here.
if (const GlobalObject *GO = dyn_cast<GlobalObject>(GV)) {
if (GO->isDeclarationForLinker())
- return cast<MCSectionXCOFF>(getSectionForExternalReference(GO, TM))
+ return static_cast<const MCSectionXCOFF *>(
+ getSectionForExternalReference(GO, TM))
->getQualNameSymbol();
if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
if (GVar->hasAttribute("toc-data"))
- return cast<MCSectionXCOFF>(
+ return static_cast<const MCSectionXCOFF *>(
SectionForGlobal(GVar, SectionKind::getData(), TM))
->getQualNameSymbol();
SectionKind GOKind = getKindForGlobal(GO, TM);
if (GOKind.isText())
- return cast<MCSectionXCOFF>(
+ return static_cast<const MCSectionXCOFF *>(
getSectionForFunctionDescriptor(cast<Function>(GO), TM))
->getQualNameSymbol();
if ((TM.getDataSections() && !GO->hasSection()) || GO->hasCommonLinkage() ||
GOKind.isBSSLocal() || GOKind.isThreadBSSLocal())
- return cast<MCSectionXCOFF>(SectionForGlobal(GO, GOKind, TM))
+ return static_cast<const MCSectionXCOFF *>(
+ SectionForGlobal(GO, GOKind, TM))
->getQualNameSymbol();
}
@@ -2740,7 +2743,7 @@ MCSection *TargetLoweringObjectFileXCOFF::getSectionForTOCEntry(
MCSection *TargetLoweringObjectFileXCOFF::getSectionForLSDA(
const Function &F, const MCSymbol &FnSym, const TargetMachine &TM) const {
- auto *LSDA = cast<MCSectionXCOFF>(LSDASection);
+ auto *LSDA = static_cast<MCSectionXCOFF *>(LSDASection);
if (TM.getFunctionSections()) {
// If option -ffunction-sections is on, append the function name to the
// name of the LSDA csect so that each function has its own LSDA csect.
diff --git a/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp b/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp
index 6267207..fd54190 100644
--- a/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp
+++ b/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp
@@ -369,6 +369,19 @@ static GlobalVariable *getOrCreateRefVariable(
AddrOfOldGV, Twine("__ref_").concat(GV->getName()),
nullptr, GlobalVariable::NotThreadLocal);
+ // RefGV is created with isConstant = false, but we want to place RefGV into
+ // .rdata, not .data. It is important that the GlobalVariable be mutable
+ // from the compiler's point of view, so that the optimizer does not remove
+ // the global variable entirely and replace all references to it with its
+ // initial value.
+ //
+ // When the Windows hot-patch loader applies a hot-patch, it maps the
+ // pages of .rdata as read/write so that it can set each __ref_* variable
+ // to point to the original variable in the base image. Afterward, pages in
+ // .rdata are remapped as read-only. This protects the __ref_* variables from
+ // being overwritten during execution.
+ RefGV->setSection(".rdata");
+
// Create debug info for the replacement global variable.
DataLayout Layout = M->getDataLayout();
DIType *DebugType = DebugInfo.createPointerType(
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index b3798f1..a8559e7 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -183,7 +183,7 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) {
std::lock_guard<sys::Mutex> locked(lock);
// Save information about our target
- Arch = (Triple::ArchType)Obj.getArch();
+ Arch = Obj.getArch();
IsTargetLittleEndian = Obj.isLittleEndian();
setMipsABI(Obj);
@@ -1361,18 +1361,17 @@ std::unique_ptr<RuntimeDyld::LoadedObjectInfo>
RuntimeDyld::loadObject(const ObjectFile &Obj) {
if (!Dyld) {
if (Obj.isELF())
- Dyld =
- createRuntimeDyldELF(static_cast<Triple::ArchType>(Obj.getArch()),
- MemMgr, Resolver, ProcessAllSections,
- std::move(NotifyStubEmitted));
+ Dyld = createRuntimeDyldELF(Obj.getArch(), MemMgr, Resolver,
+ ProcessAllSections,
+ std::move(NotifyStubEmitted));
else if (Obj.isMachO())
- Dyld = createRuntimeDyldMachO(
- static_cast<Triple::ArchType>(Obj.getArch()), MemMgr, Resolver,
- ProcessAllSections, std::move(NotifyStubEmitted));
+ Dyld = createRuntimeDyldMachO(Obj.getArch(), MemMgr, Resolver,
+ ProcessAllSections,
+ std::move(NotifyStubEmitted));
else if (Obj.isCOFF())
- Dyld = createRuntimeDyldCOFF(
- static_cast<Triple::ArchType>(Obj.getArch()), MemMgr, Resolver,
- ProcessAllSections, std::move(NotifyStubEmitted));
+ Dyld = createRuntimeDyldCOFF(Obj.getArch(), MemMgr, Resolver,
+ ProcessAllSections,
+ std::move(NotifyStubEmitted));
else
report_fatal_error("Incompatible object format!");
}
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 840ca83..7928772 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2617,7 +2617,7 @@ void OpenMPIRBuilder::emitReductionListCopy(
Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
AttributeList FuncAttrs) {
- IRBuilder<>::InsertPointGuard IPG(Builder);
+ InsertPointTy SavedIP = Builder.saveIP();
LLVMContext &Ctx = M.getContext();
FunctionType *FuncTy = FunctionType::get(
Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
@@ -2630,7 +2630,6 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
WcFunc->addParamAttr(1, Attribute::NoUndef);
BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
Builder.SetInsertPoint(EntryBB);
- Builder.SetCurrentDebugLocation(llvm::DebugLoc());
// ReduceList: thread local Reduce list.
// At the stage of the computation when this function is called, partially
@@ -2845,6 +2844,7 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
}
Builder.CreateRetVoid();
+ Builder.restoreIP(SavedIP);
return WcFunc;
}
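The save/restore pattern used here, reduced to a standalone sketch (the helper and names are illustrative). Note that IRBuilderBase::InsertPointGuard also restores the debug location, while saveIP()/restoreIP() restore only the insertion point.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
using namespace llvm;

Function *buildHelper(Module &M, IRBuilder<> &Builder) {
  IRBuilderBase::InsertPoint SavedIP = Builder.saveIP();
  auto *FnTy = FunctionType::get(Builder.getVoidTy(), /*isVarArg=*/false);
  auto *Fn = Function::Create(FnTy, GlobalValue::InternalLinkage, "helper", M);
  Builder.SetInsertPoint(BasicBlock::Create(M.getContext(), "entry", Fn));
  Builder.CreateRetVoid();
  Builder.restoreIP(SavedIP); // hand the caller back its original position
  return Fn;
}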
@@ -2853,7 +2853,6 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
AttributeList FuncAttrs) {
LLVMContext &Ctx = M.getContext();
- IRBuilder<>::InsertPointGuard IPG(Builder);
FunctionType *FuncTy =
FunctionType::get(Builder.getVoidTy(),
{Builder.getPtrTy(), Builder.getInt16Ty(),
@@ -2872,7 +2871,6 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
SarFunc->addParamAttr(3, Attribute::SExt);
BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
Builder.SetInsertPoint(EntryBB);
- Builder.SetCurrentDebugLocation(llvm::DebugLoc());
// Thread local Reduce list used to host the values of data to be reduced.
Argument *ReduceListArg = SarFunc->getArg(0);
@@ -3019,7 +3017,7 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
AttributeList FuncAttrs) {
- IRBuilder<>::InsertPointGuard IPG(Builder);
+ OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
LLVMContext &Ctx = M.getContext();
FunctionType *FuncTy = FunctionType::get(
Builder.getVoidTy(),
@@ -3035,7 +3033,6 @@ Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
Builder.SetInsertPoint(EntryBlock);
- Builder.SetCurrentDebugLocation(llvm::DebugLoc());
// Buffer: global reduction buffer.
Argument *BufferArg = LtGCFunc->getArg(0);
@@ -3123,13 +3120,14 @@ Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
}
Builder.CreateRetVoid();
+ Builder.restoreIP(OldIP);
return LtGCFunc;
}
Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
Type *ReductionsBufferTy, AttributeList FuncAttrs) {
- IRBuilder<>::InsertPointGuard IPG(Builder);
+ OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
LLVMContext &Ctx = M.getContext();
FunctionType *FuncTy = FunctionType::get(
Builder.getVoidTy(),
@@ -3145,7 +3143,6 @@ Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
Builder.SetInsertPoint(EntryBlock);
- Builder.SetCurrentDebugLocation(llvm::DebugLoc());
// Buffer: global reduction buffer.
Argument *BufferArg = LtGRFunc->getArg(0);
@@ -3206,13 +3203,14 @@ Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
->addFnAttr(Attribute::NoUnwind);
Builder.CreateRetVoid();
+ Builder.restoreIP(OldIP);
return LtGRFunc;
}
Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
AttributeList FuncAttrs) {
- IRBuilder<>::InsertPointGuard IPG(Builder);
+ OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
LLVMContext &Ctx = M.getContext();
FunctionType *FuncTy = FunctionType::get(
Builder.getVoidTy(),
@@ -3228,7 +3226,6 @@ Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
Builder.SetInsertPoint(EntryBlock);
- Builder.SetCurrentDebugLocation(llvm::DebugLoc());
// Buffer: global reduction buffer.
Argument *BufferArg = LtGCFunc->getArg(0);
@@ -3314,13 +3311,14 @@ Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
}
Builder.CreateRetVoid();
+ Builder.restoreIP(OldIP);
return LtGCFunc;
}
Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
Type *ReductionsBufferTy, AttributeList FuncAttrs) {
- IRBuilder<>::InsertPointGuard IPG(Builder);
+ OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
LLVMContext &Ctx = M.getContext();
auto *FuncTy = FunctionType::get(
Builder.getVoidTy(),
@@ -3336,7 +3334,6 @@ Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
Builder.SetInsertPoint(EntryBlock);
- Builder.SetCurrentDebugLocation(llvm::DebugLoc());
// Buffer: global reduction buffer.
Argument *BufferArg = LtGRFunc->getArg(0);
@@ -3397,6 +3394,7 @@ Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
->addFnAttr(Attribute::NoUnwind);
Builder.CreateRetVoid();
+ Builder.restoreIP(OldIP);
return LtGRFunc;
}
@@ -3409,7 +3407,6 @@ std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
Expected<Function *> OpenMPIRBuilder::createReductionFunction(
StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
- IRBuilder<>::InsertPointGuard IPG(Builder);
auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
{Builder.getPtrTy(), Builder.getPtrTy()},
/* IsVarArg */ false);
@@ -3422,7 +3419,6 @@ Expected<Function *> OpenMPIRBuilder::createReductionFunction(
BasicBlock *EntryBB =
BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
Builder.SetInsertPoint(EntryBB);
- Builder.SetCurrentDebugLocation(llvm::DebugLoc());
// Need to alloca memory here and deal with the pointers before getting
// LHS/RHS pointers out
@@ -3750,12 +3746,10 @@ static Error populateReductionFunction(
Function *ReductionFunc,
ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
- IRBuilder<>::InsertPointGuard IPG(Builder);
Module *Module = ReductionFunc->getParent();
BasicBlock *ReductionFuncBlock =
BasicBlock::Create(Module->getContext(), "", ReductionFunc);
Builder.SetInsertPoint(ReductionFuncBlock);
- Builder.SetCurrentDebugLocation(llvm::DebugLoc());
Value *LHSArrayPtr = nullptr;
Value *RHSArrayPtr = nullptr;
if (IsGPU) {
diff --git a/llvm/lib/IR/DiagnosticInfo.cpp b/llvm/lib/IR/DiagnosticInfo.cpp
index b94dcac..4f37624 100644
--- a/llvm/lib/IR/DiagnosticInfo.cpp
+++ b/llvm/lib/IR/DiagnosticInfo.cpp
@@ -81,6 +81,10 @@ void DiagnosticInfoInlineAsm::print(DiagnosticPrinter &DP) const {
DP << " at line " << getLocCookie();
}
+void DiagnosticInfoLegalizationFailure::print(DiagnosticPrinter &DP) const {
+ DP << getLocationStr() << ": " << getMsgStr();
+}
+
DiagnosticInfoRegAllocFailure::DiagnosticInfoRegAllocFailure(
const Twine &MsgStr, const Function &Fn, const DiagnosticLocation &DL,
DiagnosticSeverity Severity)
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 28037d7..49c6dc7 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -1144,9 +1144,32 @@ Value *IRBuilderBase::CreateVectorSplat(ElementCount EC, Value *V,
return CreateShuffleVector(V, Zeros, Name + ".splat");
}
-Value *IRBuilderBase::CreatePreserveArrayAccessIndex(
- Type *ElTy, Value *Base, unsigned Dimension, unsigned LastIndex,
- MDNode *DbgInfo) {
+Value *IRBuilderBase::CreateVectorInterleave(ArrayRef<Value *> Ops,
+ const Twine &Name) {
+ assert(Ops.size() >= 2 && Ops.size() <= 8 &&
+ "Unexpected number of operands to interleave");
+
+ // Make sure all operands are the same type.
+ assert(isa<VectorType>(Ops[0]->getType()) && "Unexpected type");
+
+#ifndef NDEBUG
+ for (unsigned I = 1; I < Ops.size(); I++) {
+ assert(Ops[I]->getType() == Ops[0]->getType() &&
+ "Vector interleave expects matching operand types!");
+ }
+#endif
+
+ unsigned IID = Intrinsic::getInterleaveIntrinsicID(Ops.size());
+ auto *SubvecTy = cast<VectorType>(Ops[0]->getType());
+ Type *DestTy = VectorType::get(SubvecTy->getElementType(),
+ SubvecTy->getElementCount() * Ops.size());
+ return CreateIntrinsic(IID, {DestTy}, Ops, {}, Name);
+}
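A hedged usage sketch for the new CreateVectorInterleave helper: interleaving three equally typed vectors yields a result three times as wide (the wrapper is illustrative):

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

Value *interleave3(IRBuilder<> &B, Value *V0, Value *V1, Value *V2) {
  // Operands must share one vector type; this emits llvm.vector.interleave3.
  Value *Ops[] = {V0, V1, V2};
  return B.CreateVectorInterleave(Ops, "ileave");
}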
+
+Value *IRBuilderBase::CreatePreserveArrayAccessIndex(Type *ElTy, Value *Base,
+ unsigned Dimension,
+ unsigned LastIndex,
+ MDNode *DbgInfo) {
auto *BaseType = Base->getType();
assert(isa<PointerType>(BaseType) &&
"Invalid Base ptr type for preserve.array.access.index.");
diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp
index 6c35ade..58a1f74 100644
--- a/llvm/lib/IR/Intrinsics.cpp
+++ b/llvm/lib/IR/Intrinsics.cpp
@@ -1133,3 +1133,27 @@ std::optional<Function *> Intrinsic::remangleIntrinsicFunction(Function *F) {
"Shouldn't change the signature");
return NewDecl;
}
+
+struct InterleaveIntrinsic {
+ Intrinsic::ID Interleave, Deinterleave;
+};
+
+static InterleaveIntrinsic InterleaveIntrinsics[] = {
+ {Intrinsic::vector_interleave2, Intrinsic::vector_deinterleave2},
+ {Intrinsic::vector_interleave3, Intrinsic::vector_deinterleave3},
+ {Intrinsic::vector_interleave4, Intrinsic::vector_deinterleave4},
+ {Intrinsic::vector_interleave5, Intrinsic::vector_deinterleave5},
+ {Intrinsic::vector_interleave6, Intrinsic::vector_deinterleave6},
+ {Intrinsic::vector_interleave7, Intrinsic::vector_deinterleave7},
+ {Intrinsic::vector_interleave8, Intrinsic::vector_deinterleave8},
+};
+
+Intrinsic::ID Intrinsic::getInterleaveIntrinsicID(unsigned Factor) {
+ assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
+ return InterleaveIntrinsics[Factor - 2].Interleave;
+}
+
+Intrinsic::ID Intrinsic::getDeinterleaveIntrinsicID(unsigned Factor) {
+ assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
+ return InterleaveIntrinsics[Factor - 2].Deinterleave;
+}
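A small sanity-check sketch of the mapping introduced above, which indexes the table by Factor - 2:

#include "llvm/IR/Intrinsics.h"
#include <cassert>
using namespace llvm;

void checkInterleaveMapping() {
  assert(Intrinsic::getInterleaveIntrinsicID(2) ==
         Intrinsic::vector_interleave2);
  assert(Intrinsic::getDeinterleaveIntrinsicID(8) ==
         Intrinsic::vector_deinterleave8);
}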
diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp
index 0dbd07f..1157cbe 100644
--- a/llvm/lib/IR/Metadata.cpp
+++ b/llvm/lib/IR/Metadata.cpp
@@ -1796,6 +1796,7 @@ AAMDNodes Instruction::getAAMetadata() const {
Result.TBAAStruct = Info.lookup(LLVMContext::MD_tbaa_struct);
Result.Scope = Info.lookup(LLVMContext::MD_alias_scope);
Result.NoAlias = Info.lookup(LLVMContext::MD_noalias);
+ Result.NoAliasAddrSpace = Info.lookup(LLVMContext::MD_noalias_addrspace);
}
return Result;
}
@@ -1805,6 +1806,7 @@ void Instruction::setAAMetadata(const AAMDNodes &N) {
setMetadata(LLVMContext::MD_tbaa_struct, N.TBAAStruct);
setMetadata(LLVMContext::MD_alias_scope, N.Scope);
setMetadata(LLVMContext::MD_noalias, N.NoAlias);
+ setMetadata(LLVMContext::MD_noalias_addrspace, N.NoAliasAddrSpace);
}
void Instruction::setNoSanitizeMetadata() {
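With NoAliasAddrSpace wired into AAMDNodes, round-tripping alias metadata between instructions now preserves !noalias.addrspace as well; a minimal sketch:

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

void copyAliasInfo(const Instruction &Src, Instruction &Dst) {
  AAMDNodes AA = Src.getAAMetadata(); // now includes NoAliasAddrSpace
  Dst.setAAMetadata(AA);
}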
diff --git a/llvm/lib/MC/CMakeLists.txt b/llvm/lib/MC/CMakeLists.txt
index d662c42..18a85b3 100644
--- a/llvm/lib/MC/CMakeLists.txt
+++ b/llvm/lib/MC/CMakeLists.txt
@@ -43,13 +43,7 @@ add_llvm_component_library(LLVMMC
MCRegisterInfo.cpp
MCSchedule.cpp
MCSection.cpp
- MCSectionCOFF.cpp
- MCSectionDXContainer.cpp
- MCSectionELF.cpp
- MCSectionGOFF.cpp
MCSectionMachO.cpp
- MCSectionWasm.cpp
- MCSectionXCOFF.cpp
MCStreamer.cpp
MCSPIRVStreamer.cpp
MCSubtargetInfo.cpp
diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp
index 9f52b3e..ae8dffc 100644
--- a/llvm/lib/MC/ELFObjectWriter.cpp
+++ b/llvm/lib/MC/ELFObjectWriter.cpp
@@ -559,20 +559,7 @@ void ELFWriter::computeSymbolTable(const RevGroupMapTy &RevGroupMap) {
} else {
const MCSectionELF &Section =
static_cast<const MCSectionELF &>(Symbol.getSection());
-
- // We may end up with a situation when section symbol is technically
- // defined, but should not be. That happens because we explicitly
- // pre-create few .debug_* sections to have accessors.
- // And if these sections were not really defined in the code, but were
- // referenced, we simply error out.
- if (!Section.isRegistered()) {
- assert(static_cast<const MCSymbolELF &>(Symbol).getType() ==
- ELF::STT_SECTION);
- Ctx.reportError(SMLoc(),
- "Undefined section reference: " + Symbol.getName());
- continue;
- }
-
+ assert(Section.isRegistered());
if (Mode == NonDwoOnly && isDwoSection(Section))
continue;
MSD.SectionIndex = Section.getOrdinal();
@@ -1100,7 +1087,8 @@ uint64_t ELFWriter::writeObject() {
// Remember the offset into the file for this section.
const uint64_t SecStart = align(RelSection->getAlign());
- writeRelocations(cast<MCSectionELF>(*RelSection->getLinkedToSection()));
+ writeRelocations(
+ static_cast<const MCSectionELF &>(*RelSection->getLinkedToSection()));
uint64_t SecEnd = W.OS.tell();
RelSection->setOffsets(SecStart, SecEnd);
@@ -1273,7 +1261,7 @@ bool ELFObjectWriter::useSectionSymbol(const MCValue &Val,
// that it pointed to another string and subtracting 42 at runtime will
// produce the wrong value.
if (Sym->isInSection()) {
- auto &Sec = cast<MCSectionELF>(Sym->getSection());
+ auto &Sec = static_cast<const MCSectionELF &>(Sym->getSection());
unsigned Flags = Sec.getFlags();
if (Flags & ELF::SHF_MERGE) {
if (C != 0)
@@ -1325,13 +1313,14 @@ bool ELFObjectWriter::checkRelocation(SMLoc Loc, const MCSectionELF *From,
void ELFObjectWriter::recordRelocation(const MCFragment &F,
const MCFixup &Fixup, MCValue Target,
uint64_t &FixedValue) {
- const MCSectionELF &Section = cast<MCSectionELF>(*F.getParent());
+ auto &Section = static_cast<const MCSectionELF &>(*F.getParent());
MCContext &Ctx = getContext();
const auto *SymA = cast_or_null<MCSymbolELF>(Target.getAddSym());
- const MCSectionELF *SecA = (SymA && SymA->isInSection())
- ? cast<MCSectionELF>(&SymA->getSection())
- : nullptr;
+ const MCSectionELF *SecA =
+ (SymA && SymA->isInSection())
+ ? static_cast<const MCSectionELF *>(&SymA->getSection())
+ : nullptr;
if (DwoOS && !checkRelocation(Fixup.getLoc(), &Section, SecA))
return;
diff --git a/llvm/lib/MC/GOFFObjectWriter.cpp b/llvm/lib/MC/GOFFObjectWriter.cpp
index 1871f5f..88188f3 100644
--- a/llvm/lib/MC/GOFFObjectWriter.cpp
+++ b/llvm/lib/MC/GOFFObjectWriter.cpp
@@ -336,7 +336,7 @@ void GOFFWriter::defineSymbols() {
unsigned Ordinal = 0;
// Process all sections.
for (MCSection &S : Asm) {
- auto &Section = cast<MCSectionGOFF>(S);
+ auto &Section = static_cast<MCSectionGOFF &>(S);
Section.setOrdinal(++Ordinal);
defineSectionSymbols(Section);
}
diff --git a/llvm/lib/MC/MCAsmInfoCOFF.cpp b/llvm/lib/MC/MCAsmInfoCOFF.cpp
index 0b8781c..54717df 100644
--- a/llvm/lib/MC/MCAsmInfoCOFF.cpp
+++ b/llvm/lib/MC/MCAsmInfoCOFF.cpp
@@ -12,7 +12,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCAsmInfoCOFF.h"
+#include "llvm/BinaryFormat/COFF.h"
#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
using namespace llvm;
@@ -49,6 +55,10 @@ MCAsmInfoCOFF::MCAsmInfoCOFF() {
HasCOFFComdatConstants = true;
}
+bool MCAsmInfoCOFF::useCodeAlign(const MCSection &Sec) const {
+ return Sec.isText();
+}
+
void MCAsmInfoMicrosoft::anchor() {}
MCAsmInfoMicrosoft::MCAsmInfoMicrosoft() = default;
@@ -64,3 +74,101 @@ MCAsmInfoGNUCOFF::MCAsmInfoGNUCOFF() {
// We don't create constants in comdat sections for MinGW.
HasCOFFComdatConstants = false;
}
+
+bool MCSectionCOFF::shouldOmitSectionDirective(StringRef Name) const {
+ if (COMDATSymbol || isUnique())
+ return false;
+
+ // FIXME: Does .section .bss/.data/.text work everywhere??
+ if (Name == ".text" || Name == ".data" || Name == ".bss")
+ return true;
+
+ return false;
+}
+
+void MCSectionCOFF::setSelection(int Selection) const {
+ assert(Selection != 0 && "invalid COMDAT selection type");
+ this->Selection = Selection;
+ Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
+}
+
+void MCAsmInfoCOFF::printSwitchToSection(const MCSection &Section, uint32_t,
+ const Triple &T,
+ raw_ostream &OS) const {
+ auto &Sec = static_cast<const MCSectionCOFF &>(Section);
+ // Standard sections don't require the '.section' directive.
+ if (Sec.shouldOmitSectionDirective(Sec.getName())) {
+ OS << '\t' << Sec.getName() << '\n';
+ return;
+ }
+
+ OS << "\t.section\t" << Sec.getName() << ",\"";
+ if (Sec.getCharacteristics() & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA)
+ OS << 'd';
+ if (Sec.getCharacteristics() & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA)
+ OS << 'b';
+ if (Sec.getCharacteristics() & COFF::IMAGE_SCN_MEM_EXECUTE)
+ OS << 'x';
+ if (Sec.getCharacteristics() & COFF::IMAGE_SCN_MEM_WRITE)
+ OS << 'w';
+ else if (Sec.getCharacteristics() & COFF::IMAGE_SCN_MEM_READ)
+ OS << 'r';
+ else
+ OS << 'y';
+ if (Sec.getCharacteristics() & COFF::IMAGE_SCN_LNK_REMOVE)
+ OS << 'n';
+ if (Sec.getCharacteristics() & COFF::IMAGE_SCN_MEM_SHARED)
+ OS << 's';
+ if ((Sec.getCharacteristics() & COFF::IMAGE_SCN_MEM_DISCARDABLE) &&
+ !Sec.isImplicitlyDiscardable(Sec.getName()))
+ OS << 'D';
+ if (Sec.getCharacteristics() & COFF::IMAGE_SCN_LNK_INFO)
+ OS << 'i';
+ OS << '"';
+
+ // The 'unique' qualifier must come at the tail of the .section directive.
+ if (Sec.isUnique() && !Sec.COMDATSymbol)
+ OS << ",unique," << Sec.UniqueID;
+
+ if (Sec.getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT) {
+ if (Sec.COMDATSymbol)
+ OS << ",";
+ else
+ OS << "\n\t.linkonce\t";
+ switch (Sec.Selection) {
+ case COFF::IMAGE_COMDAT_SELECT_NODUPLICATES:
+ OS << "one_only";
+ break;
+ case COFF::IMAGE_COMDAT_SELECT_ANY:
+ OS << "discard";
+ break;
+ case COFF::IMAGE_COMDAT_SELECT_SAME_SIZE:
+ OS << "same_size";
+ break;
+ case COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH:
+ OS << "same_contents";
+ break;
+ case COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE:
+ OS << "associative";
+ break;
+ case COFF::IMAGE_COMDAT_SELECT_LARGEST:
+ OS << "largest";
+ break;
+ case COFF::IMAGE_COMDAT_SELECT_NEWEST:
+ OS << "newest";
+ break;
+ default:
+ assert(false && "unsupported COFF selection type");
+ break;
+ }
+ if (Sec.COMDATSymbol) {
+ OS << ",";
+ Sec.COMDATSymbol->print(OS, this);
+ }
+ }
+
+ if (Sec.isUnique() && Sec.COMDATSymbol)
+ OS << ",unique," << Sec.UniqueID;
+
+ OS << '\n';
+}
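As a worked illustration (section name and COMDAT symbol are hypothetical), an initialized read-only COMDAT section with IMAGE_COMDAT_SELECT_ANY would print roughly:

  .section .rdata,"dr",discard,_MySym

while plain .text/.data/.bss take the shouldOmitSectionDirective fast path above and print just the bare section name.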
diff --git a/llvm/lib/MC/MCAsmInfoDarwin.cpp b/llvm/lib/MC/MCAsmInfoDarwin.cpp
index 9cba775..e156fa0 100644
--- a/llvm/lib/MC/MCAsmInfoDarwin.cpp
+++ b/llvm/lib/MC/MCAsmInfoDarwin.cpp
@@ -85,3 +85,8 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() {
DwarfUsesRelocationsAcrossSections = false;
SetDirectiveSuppressesReloc = true;
}
+
+bool MCAsmInfoDarwin::useCodeAlign(const MCSection &Sec) const {
+ return static_cast<const MCSectionMachO &>(Sec).hasAttribute(
+ MachO::S_ATTR_PURE_INSTRUCTIONS);
+}
diff --git a/llvm/lib/MC/MCAsmInfoELF.cpp b/llvm/lib/MC/MCAsmInfoELF.cpp
index 7eb89ef..cdae9d7 100644
--- a/llvm/lib/MC/MCAsmInfoELF.cpp
+++ b/llvm/lib/MC/MCAsmInfoELF.cpp
@@ -12,9 +12,16 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCAsmInfoELF.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/TargetParser/Triple.h"
+#include <cassert>
using namespace llvm;
@@ -28,9 +35,198 @@ MCSection *MCAsmInfoELF::getNonexecutableStackSection(MCContext &Ctx) const {
return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS, 0);
}
+bool MCAsmInfoELF::useCodeAlign(const MCSection &Sec) const {
+ return static_cast<const MCSectionELF &>(Sec).getFlags() & ELF::SHF_EXECINSTR;
+}
+
MCAsmInfoELF::MCAsmInfoELF() {
HasIdentDirective = true;
WeakRefDirective = "\t.weak\t";
PrivateGlobalPrefix = ".L";
PrivateLabelPrefix = ".L";
}
+
+static void printName(raw_ostream &OS, StringRef Name) {
+ if (Name.find_first_not_of("0123456789_."
+ "abcdefghijklmnopqrstuvwxyz"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == Name.npos) {
+ OS << Name;
+ return;
+ }
+ OS << '"';
+ for (const char *B = Name.begin(), *E = Name.end(); B < E; ++B) {
+ if (*B == '"') // Unquoted "
+ OS << "\\\"";
+ else if (*B != '\\') // Neither quote nor backslash
+ OS << *B;
+ else if (B + 1 == E) // Trailing backslash
+ OS << "\\\\";
+ else {
+ OS << B[0] << B[1]; // Quoted character
+ ++B;
+ }
+ }
+ OS << '"';
+}
+
+void MCAsmInfoELF::printSwitchToSection(const MCSection &Section,
+ uint32_t Subsection, const Triple &T,
+ raw_ostream &OS) const {
+ auto &Sec = static_cast<const MCSectionELF &>(Section);
+ if (!Sec.isUnique() && shouldOmitSectionDirective(Sec.getName())) {
+ OS << '\t' << Sec.getName();
+ if (Subsection)
+ OS << '\t' << Subsection;
+ OS << '\n';
+ return;
+ }
+
+ OS << "\t.section\t";
+ printName(OS, Sec.getName());
+
+ // Handle the weird Solaris syntax if desired.
+ if (usesSunStyleELFSectionSwitchSyntax() && !(Sec.Flags & ELF::SHF_MERGE)) {
+ if (Sec.Flags & ELF::SHF_ALLOC)
+ OS << ",#alloc";
+ if (Sec.Flags & ELF::SHF_EXECINSTR)
+ OS << ",#execinstr";
+ if (Sec.Flags & ELF::SHF_WRITE)
+ OS << ",#write";
+ if (Sec.Flags & ELF::SHF_EXCLUDE)
+ OS << ",#exclude";
+ if (Sec.Flags & ELF::SHF_TLS)
+ OS << ",#tls";
+ OS << '\n';
+ return;
+ }
+
+ OS << ",\"";
+ if (Sec.Flags & ELF::SHF_ALLOC)
+ OS << 'a';
+ if (Sec.Flags & ELF::SHF_EXCLUDE)
+ OS << 'e';
+ if (Sec.Flags & ELF::SHF_EXECINSTR)
+ OS << 'x';
+ if (Sec.Flags & ELF::SHF_WRITE)
+ OS << 'w';
+ if (Sec.Flags & ELF::SHF_MERGE)
+ OS << 'M';
+ if (Sec.Flags & ELF::SHF_STRINGS)
+ OS << 'S';
+ if (Sec.Flags & ELF::SHF_TLS)
+ OS << 'T';
+ if (Sec.Flags & ELF::SHF_LINK_ORDER)
+ OS << 'o';
+ if (Sec.Flags & ELF::SHF_GROUP)
+ OS << 'G';
+ if (Sec.Flags & ELF::SHF_GNU_RETAIN)
+ OS << 'R';
+
+ // If there are OS-specific flags, print them.
+ if (T.isOSSolaris())
+ if (Sec.Flags & ELF::SHF_SUNW_NODISCARD)
+ OS << 'R';
+
+ // If there are target-specific flags, print them.
+ Triple::ArchType Arch = T.getArch();
+ if (Arch == Triple::xcore) {
+ if (Sec.Flags & ELF::XCORE_SHF_CP_SECTION)
+ OS << 'c';
+ if (Sec.Flags & ELF::XCORE_SHF_DP_SECTION)
+ OS << 'd';
+ } else if (T.isARM() || T.isThumb()) {
+ if (Sec.Flags & ELF::SHF_ARM_PURECODE)
+ OS << 'y';
+ } else if (T.isAArch64()) {
+ if (Sec.Flags & ELF::SHF_AARCH64_PURECODE)
+ OS << 'y';
+ } else if (Arch == Triple::hexagon) {
+ if (Sec.Flags & ELF::SHF_HEX_GPREL)
+ OS << 's';
+ } else if (Arch == Triple::x86_64) {
+ if (Sec.Flags & ELF::SHF_X86_64_LARGE)
+ OS << 'l';
+ }
+
+ OS << '"';
+
+ OS << ',';
+
+ // If the comment string is '@' (e.g. on ARM), use '%' instead.
+ if (getCommentString()[0] == '@')
+ OS << '%';
+ else
+ OS << '@';
+
+ if (Sec.Type == ELF::SHT_INIT_ARRAY)
+ OS << "init_array";
+ else if (Sec.Type == ELF::SHT_FINI_ARRAY)
+ OS << "fini_array";
+ else if (Sec.Type == ELF::SHT_PREINIT_ARRAY)
+ OS << "preinit_array";
+ else if (Sec.Type == ELF::SHT_NOBITS)
+ OS << "nobits";
+ else if (Sec.Type == ELF::SHT_NOTE)
+ OS << "note";
+ else if (Sec.Type == ELF::SHT_PROGBITS)
+ OS << "progbits";
+ else if (Sec.Type == ELF::SHT_X86_64_UNWIND)
+ OS << "unwind";
+ else if (Sec.Type == ELF::SHT_MIPS_DWARF)
+ // Print the hex value of the type while there is no standard
+ // symbolic representation for it.
+ OS << "0x7000001e";
+ else if (Sec.Type == ELF::SHT_LLVM_ODRTAB)
+ OS << "llvm_odrtab";
+ else if (Sec.Type == ELF::SHT_LLVM_LINKER_OPTIONS)
+ OS << "llvm_linker_options";
+ else if (Sec.Type == ELF::SHT_LLVM_CALL_GRAPH_PROFILE)
+ OS << "llvm_call_graph_profile";
+ else if (Sec.Type == ELF::SHT_LLVM_DEPENDENT_LIBRARIES)
+ OS << "llvm_dependent_libraries";
+ else if (Sec.Type == ELF::SHT_LLVM_SYMPART)
+ OS << "llvm_sympart";
+ else if (Sec.Type == ELF::SHT_LLVM_BB_ADDR_MAP)
+ OS << "llvm_bb_addr_map";
+ else if (Sec.Type == ELF::SHT_LLVM_OFFLOADING)
+ OS << "llvm_offloading";
+ else if (Sec.Type == ELF::SHT_LLVM_LTO)
+ OS << "llvm_lto";
+ else if (Sec.Type == ELF::SHT_LLVM_JT_SIZES)
+ OS << "llvm_jt_sizes";
+ else if (Sec.Type == ELF::SHT_LLVM_CFI_JUMP_TABLE)
+ OS << "llvm_cfi_jump_table";
+ else
+ OS << "0x" << Twine::utohexstr(Sec.Type);
+
+ if (Sec.EntrySize) {
+ assert((Sec.Flags & ELF::SHF_MERGE) ||
+ Sec.Type == ELF::SHT_LLVM_CFI_JUMP_TABLE);
+ OS << "," << Sec.EntrySize;
+ }
+
+ if (Sec.Flags & ELF::SHF_LINK_ORDER) {
+ OS << ",";
+ if (Sec.LinkedToSym)
+ printName(OS, Sec.LinkedToSym->getName());
+ else
+ OS << '0';
+ }
+
+ if (Sec.Flags & ELF::SHF_GROUP) {
+ OS << ",";
+ printName(OS, Sec.Group.getPointer()->getName());
+ if (Sec.isComdat())
+ OS << ",comdat";
+ }
+
+ if (Sec.isUnique())
+ OS << ",unique," << Sec.UniqueID;
+
+ OS << '\n';
+
+ if (Subsection) {
+ OS << "\t.subsection\t" << Subsection;
+ OS << '\n';
+ }
+}
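As a worked illustration (the section name is hypothetical), a mergeable 8-byte constant section would print roughly:

  .section .rodata.cst8,"aM",@progbits,8

that is, the flag letters, then the type prefixed by '@' (or '%' when the comment string is '@'), then the entry size required for SHF_MERGE sections.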
diff --git a/llvm/lib/MC/MCAsmInfoGOFF.cpp b/llvm/lib/MC/MCAsmInfoGOFF.cpp
index 3c81a46..0a5d1927 100644
--- a/llvm/lib/MC/MCAsmInfoGOFF.cpp
+++ b/llvm/lib/MC/MCAsmInfoGOFF.cpp
@@ -13,11 +13,12 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCAsmInfoGOFF.h"
+#include "llvm/BinaryFormat/GOFF.h"
+#include "llvm/MC/MCSectionGOFF.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-void MCAsmInfoGOFF::anchor() {}
-
MCAsmInfoGOFF::MCAsmInfoGOFF() {
Data64bitsDirective = "\t.quad\t";
HasDotTypeDotSizeDirective = false;
@@ -25,3 +26,136 @@ MCAsmInfoGOFF::MCAsmInfoGOFF() {
PrivateLabelPrefix = "L#";
ZeroDirective = "\t.space\t";
}
+
+static void emitCATTR(raw_ostream &OS, StringRef Name, GOFF::ESDRmode Rmode,
+ GOFF::ESDAlignment Alignment,
+ GOFF::ESDLoadingBehavior LoadBehavior,
+ GOFF::ESDExecutable Executable, bool IsReadOnly,
+ uint32_t SortKey, uint8_t FillByteValue,
+ StringRef PartName) {
+ OS << Name << " CATTR ";
+ OS << "ALIGN(" << static_cast<unsigned>(Alignment) << "),"
+ << "FILL(" << static_cast<unsigned>(FillByteValue) << ")";
+ switch (LoadBehavior) {
+ case GOFF::ESD_LB_Deferred:
+ OS << ",DEFLOAD";
+ break;
+ case GOFF::ESD_LB_NoLoad:
+ OS << ",NOLOAD";
+ break;
+ default:
+ break;
+ }
+ switch (Executable) {
+ case GOFF::ESD_EXE_CODE:
+ OS << ",EXECUTABLE";
+ break;
+ case GOFF::ESD_EXE_DATA:
+ OS << ",NOTEXECUTABLE";
+ break;
+ default:
+ break;
+ }
+ if (IsReadOnly)
+ OS << ",READONLY";
+ if (Rmode != GOFF::ESD_RMODE_None) {
+ OS << ',';
+ OS << "RMODE(";
+ switch (Rmode) {
+ case GOFF::ESD_RMODE_24:
+ OS << "24";
+ break;
+ case GOFF::ESD_RMODE_31:
+ OS << "31";
+ break;
+ case GOFF::ESD_RMODE_64:
+ OS << "64";
+ break;
+ case GOFF::ESD_RMODE_None:
+ break;
+ }
+ OS << ')';
+ }
+ if (SortKey)
+ OS << ",PRIORITY(" << SortKey << ")";
+ if (!PartName.empty())
+ OS << ",PART(" << PartName << ")";
+ OS << '\n';
+}
+
+static void emitXATTR(raw_ostream &OS, StringRef Name,
+ GOFF::ESDLinkageType Linkage,
+ GOFF::ESDExecutable Executable,
+ GOFF::ESDBindingScope BindingScope) {
+ OS << Name << " XATTR ";
+ OS << "LINKAGE(" << (Linkage == GOFF::ESD_LT_OS ? "OS" : "XPLINK") << "),";
+ if (Executable != GOFF::ESD_EXE_Unspecified)
+ OS << "REFERENCE(" << (Executable == GOFF::ESD_EXE_CODE ? "CODE" : "DATA")
+ << "),";
+ if (BindingScope != GOFF::ESD_BSC_Unspecified) {
+ OS << "SCOPE(";
+ switch (BindingScope) {
+ case GOFF::ESD_BSC_Section:
+ OS << "SECTION";
+ break;
+ case GOFF::ESD_BSC_Module:
+ OS << "MODULE";
+ break;
+ case GOFF::ESD_BSC_Library:
+ OS << "LIBRARY";
+ break;
+ case GOFF::ESD_BSC_ImportExport:
+ OS << "EXPORT";
+ break;
+ default:
+ break;
+ }
+ OS << ')';
+ }
+ OS << '\n';
+}
+
+void MCAsmInfoGOFF::printSwitchToSection(const MCSection &Section,
+ uint32_t Subsection, const Triple &T,
+ raw_ostream &OS) const {
+ auto &Sec =
+ const_cast<MCSectionGOFF &>(static_cast<const MCSectionGOFF &>(Section));
+ switch (Sec.SymbolType) {
+ case GOFF::ESD_ST_SectionDefinition: {
+ OS << Sec.getName() << " CSECT\n";
+ Sec.Emitted = true;
+ break;
+ }
+ case GOFF::ESD_ST_ElementDefinition: {
+ printSwitchToSection(*Sec.getParent(), Subsection, T, OS);
+ if (!Sec.Emitted) {
+ emitCATTR(OS, Sec.getName(), Sec.EDAttributes.Rmode,
+ Sec.EDAttributes.Alignment, Sec.EDAttributes.LoadBehavior,
+ GOFF::ESD_EXE_Unspecified, Sec.EDAttributes.IsReadOnly, 0,
+ Sec.EDAttributes.FillByteValue, StringRef());
+ Sec.Emitted = true;
+ } else
+ OS << Sec.getName() << " CATTR\n";
+ break;
+ }
+ case GOFF::ESD_ST_PartReference: {
+ MCSectionGOFF *ED = Sec.getParent();
+ printSwitchToSection(*ED->getParent(), Subsection, T, OS);
+ if (!Sec.Emitted) {
+ emitCATTR(OS, ED->getName(), ED->getEDAttributes().Rmode,
+ ED->EDAttributes.Alignment, ED->EDAttributes.LoadBehavior,
+ Sec.PRAttributes.Executable, ED->EDAttributes.IsReadOnly,
+ Sec.PRAttributes.SortKey, ED->EDAttributes.FillByteValue,
+ Sec.getName());
+ emitXATTR(OS, Sec.getName(), Sec.PRAttributes.Linkage,
+ Sec.PRAttributes.Executable, Sec.PRAttributes.BindingScope);
+ ED->Emitted = true;
+ Sec.Emitted = true;
+ } else
+ OS << ED->getName() << " CATTR PART(" << Sec.getName() << ")\n";
+ break;
+ }
+ default:
+ llvm_unreachable("Wrong section type");
+ }
+}
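As a hedged illustration of the GOFF output (all names here are hypothetical), an element definition for 64-bit executable code could print a CATTR line such as:

C_CODE64 CATTR ALIGN(3),FILL(0),EXECUTABLE,RMODE(64)

and a part reference then follows with an XATTR line like:

MyPart XATTR LINKAGE(XPLINK),REFERENCE(CODE),SCOPE(LIBRARY)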
diff --git a/llvm/lib/MC/MCAsmInfoWasm.cpp b/llvm/lib/MC/MCAsmInfoWasm.cpp
index ce6ec7e..5e44f48 100644
--- a/llvm/lib/MC/MCAsmInfoWasm.cpp
+++ b/llvm/lib/MC/MCAsmInfoWasm.cpp
@@ -12,9 +12,11 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCAsmInfoWasm.h"
-using namespace llvm;
+#include "llvm/MC/MCSectionWasm.h"
+#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/Support/raw_ostream.h"
-void MCAsmInfoWasm::anchor() {}
+using namespace llvm;
MCAsmInfoWasm::MCAsmInfoWasm() {
HasIdentDirective = true;
@@ -23,3 +25,80 @@ MCAsmInfoWasm::MCAsmInfoWasm() {
PrivateGlobalPrefix = ".L";
PrivateLabelPrefix = ".L";
}
+
+static void printName(raw_ostream &OS, StringRef Name) {
+ if (Name.find_first_not_of("0123456789_."
+ "abcdefghijklmnopqrstuvwxyz"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == Name.npos) {
+ OS << Name;
+ return;
+ }
+ OS << '"';
+ for (const char *B = Name.begin(), *E = Name.end(); B < E; ++B) {
+ if (*B == '"') // Unquoted "
+ OS << "\\\"";
+ else if (*B != '\\') // Neither quote nor backslash
+ OS << *B;
+ else if (B + 1 == E) // Trailing backslash
+ OS << "\\\\";
+ else {
+ OS << B[0] << B[1]; // Quoted character
+ ++B;
+ }
+ }
+ OS << '"';
+}
+
+void MCAsmInfoWasm::printSwitchToSection(const MCSection &Section,
+ uint32_t Subsection, const Triple &T,
+ raw_ostream &OS) const {
+ auto &Sec = static_cast<const MCSectionWasm &>(Section);
+ if (shouldOmitSectionDirective(Sec.getName())) {
+ OS << '\t' << Sec.getName();
+ if (Subsection)
+ OS << '\t' << Subsection;
+ OS << '\n';
+ return;
+ }
+
+ OS << "\t.section\t";
+ printName(OS, Sec.getName());
+ OS << ",\"";
+
+ if (Sec.IsPassive)
+ OS << 'p';
+ if (Sec.Group)
+ OS << 'G';
+ if (Sec.SegmentFlags & wasm::WASM_SEG_FLAG_STRINGS)
+ OS << 'S';
+ if (Sec.SegmentFlags & wasm::WASM_SEG_FLAG_TLS)
+ OS << 'T';
+ if (Sec.SegmentFlags & wasm::WASM_SEG_FLAG_RETAIN)
+ OS << 'R';
+
+ OS << '"';
+
+ OS << ',';
+
+ // If the comment string is '@' (e.g. on ARM), use '%' instead.
+ if (getCommentString()[0] == '@')
+ OS << '%';
+ else
+ OS << '@';
+
+ // TODO: Print section type.
+
+ if (Sec.Group) {
+ OS << ",";
+ printName(OS, Sec.Group->getName());
+ OS << ",comdat";
+ }
+
+ if (Sec.isUnique())
+ OS << ",unique," << Sec.UniqueID;
+
+ OS << '\n';
+
+ if (Subsection)
+ OS << "\t.subsection\t" << Subsection << '\n';
+}
diff --git a/llvm/lib/MC/MCAsmInfoXCOFF.cpp b/llvm/lib/MC/MCAsmInfoXCOFF.cpp
index 6ef11ba..0403b44 100644
--- a/llvm/lib/MC/MCAsmInfoXCOFF.cpp
+++ b/llvm/lib/MC/MCAsmInfoXCOFF.cpp
@@ -8,7 +8,11 @@
#include "llvm/MC/MCAsmInfoXCOFF.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSectionXCOFF.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -16,8 +20,6 @@ namespace llvm {
extern cl::opt<cl::boolOrDefault> UseLEB128Directives;
}
-void MCAsmInfoXCOFF::anchor() {}
-
MCAsmInfoXCOFF::MCAsmInfoXCOFF() {
IsAIX = true;
IsLittleEndian = false;
@@ -56,3 +58,121 @@ bool MCAsmInfoXCOFF::isAcceptableChar(char C) const {
// any combination of these.
return isAlnum(C) || C == '_' || C == '.';
}
+
+bool MCAsmInfoXCOFF::useCodeAlign(const MCSection &Sec) const {
+ return static_cast<const MCSectionXCOFF &>(Sec).getKind().isText();
+}
+
+MCSectionXCOFF::~MCSectionXCOFF() = default;
+
+void MCSectionXCOFF::printCsectDirective(raw_ostream &OS) const {
+ OS << "\t.csect " << QualName->getName() << "," << Log2(getAlign()) << '\n';
+}
+
+void MCAsmInfoXCOFF::printSwitchToSection(const MCSection &Section, uint32_t,
+ const Triple &T,
+ raw_ostream &OS) const {
+ auto &Sec = static_cast<const MCSectionXCOFF &>(Section);
+ if (Sec.getKind().isText()) {
+ if (Sec.getMappingClass() != XCOFF::XMC_PR)
+ report_fatal_error("Unhandled storage-mapping class for .text csect");
+
+ Sec.printCsectDirective(OS);
+ return;
+ }
+
+ if (Sec.getKind().isReadOnly()) {
+ if (Sec.getMappingClass() != XCOFF::XMC_RO &&
+ Sec.getMappingClass() != XCOFF::XMC_TD)
+ report_fatal_error("Unhandled storage-mapping class for .rodata csect.");
+ Sec.printCsectDirective(OS);
+ return;
+ }
+
+ if (Sec.getKind().isReadOnlyWithRel()) {
+ if (Sec.getMappingClass() != XCOFF::XMC_RW &&
+ Sec.getMappingClass() != XCOFF::XMC_RO &&
+ Sec.getMappingClass() != XCOFF::XMC_TD)
+ report_fatal_error(
+ "Unexpected storage-mapping class for ReadOnlyWithRel kind");
+ Sec.printCsectDirective(OS);
+ return;
+ }
+
+ // Initialized TLS data.
+ if (Sec.getKind().isThreadData()) {
+ // We only expect XMC_TL here for initialized TLS data.
+ if (Sec.getMappingClass() != XCOFF::XMC_TL)
+ report_fatal_error("Unhandled storage-mapping class for .tdata csect.");
+ Sec.printCsectDirective(OS);
+ return;
+ }
+
+ if (Sec.getKind().isData()) {
+ switch (Sec.getMappingClass()) {
+ case XCOFF::XMC_RW:
+ case XCOFF::XMC_DS:
+ case XCOFF::XMC_TD:
+ Sec.printCsectDirective(OS);
+ break;
+ case XCOFF::XMC_TC:
+ case XCOFF::XMC_TE:
+ break;
+ case XCOFF::XMC_TC0:
+ OS << "\t.toc\n";
+ break;
+ default:
+ report_fatal_error("Unhandled storage-mapping class for .data csect.");
+ }
+ return;
+ }
+
+ if (Sec.isCsect() && Sec.getMappingClass() == XCOFF::XMC_TD) {
+ // A common csect type (uninitialized storage) does not need a csect
+ // directive for section switching unless it is local.
+ if (Sec.getKind().isCommon() && !Sec.getKind().isBSSLocal())
+ return;
+
+ assert(Sec.getKind().isBSS() && "Unexpected section kind for toc-data");
+ Sec.printCsectDirective(OS);
+ return;
+ }
+ // A common csect type (uninitialized storage) does not need a csect
+ // directive for section switching.
+ if (Sec.isCsect() && Sec.getCSectType() == XCOFF::XTY_CM) {
+ assert((Sec.getMappingClass() == XCOFF::XMC_RW ||
+ Sec.getMappingClass() == XCOFF::XMC_BS ||
+ Sec.getMappingClass() == XCOFF::XMC_UL) &&
+ "Generated a storage-mapping class for a common/bss/tbss csect we "
+ "don't understand how to switch to.");
+ // Common symbols and local zero-initialized symbols for TLS and non-TLS
+ // are eligible for the .bss/.tbss csect. getKind().isThreadBSS() is used
+ // to cover TLS common and zero-initialized local symbols, since the
+ // linkage type (in the GlobalVariable) is not accessible in this class.
+ assert((Sec.getKind().isBSSLocal() || Sec.getKind().isCommon() ||
+ Sec.getKind().isThreadBSS()) &&
+ "wrong symbol type for .bss/.tbss csect");
+ // No directive is needed to switch sections for commons and
+ // zero-initialized TLS data; the variable's '.comm' or '.lcomm'
+ // directive will create the needed csect.
+ return;
+ }
+
+ // Zero-initialized TLS data with weak or external linkage is not
+ // eligible to be put into a common csect.
+ if (Sec.getKind().isThreadBSS()) {
+ Sec.printCsectDirective(OS);
+ return;
+ }
+
+ // XCOFF debug sections.
+ if (Sec.getKind().isMetadata() && Sec.isDwarfSect()) {
+ OS << "\n\t.dwsect " << format("0x%" PRIx32, *Sec.getDwarfSubtypeFlags())
+ << '\n';
+ OS << Sec.getName() << ':' << '\n';
+ return;
+ }
+
+ report_fatal_error("Printing for this SectionKind is unimplemented.");
+}
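For reference, each of the csect cases above reduces to the directive from printCsectDirective; e.g. a text csect with mapping class PR and 32-byte alignment would print (the name is hypothetical):

  .csect .foo[PR],5

where the trailing operand is the log2 of the csect alignment.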
diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index 7119ef4..da51da4 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -532,8 +532,8 @@ void MCAsmStreamer::switchSection(MCSection *Section, uint32_t Subsection) {
if (MCTargetStreamer *TS = getTargetStreamer()) {
TS->changeSection(Cur.first, Section, Subsection, OS);
} else {
- Section->printSwitchToSection(*MAI, getContext().getTargetTriple(), OS,
- Subsection);
+ MAI->printSwitchToSection(*Section, Subsection,
+ getContext().getTargetTriple(), OS);
}
}
MCStreamer::switchSection(Section, Subsection);
@@ -543,7 +543,7 @@ bool MCAsmStreamer::popSection() {
if (!MCStreamer::popSection())
return false;
auto [Sec, Subsec] = getCurrentSection();
- Sec->printSwitchToSection(*MAI, getContext().getTargetTriple(), OS, Subsec);
+ MAI->printSwitchToSection(*Sec, Subsec, getContext().getTargetTriple(), OS);
return true;
}
@@ -1105,7 +1105,7 @@ void MCAsmStreamer::emitZerofill(MCSection *Section, MCSymbol *Symbol,
// Note: a .zerofill directive does not switch sections.
OS << ".zerofill ";
- assert(Section->getVariant() == MCSection::SV_MachO &&
+ assert(getContext().getObjectFileType() == MCContext::IsMachO &&
".zerofill is a Mach-O specific directive");
// This is a mach-o specific directive.
@@ -1130,7 +1130,7 @@ void MCAsmStreamer::emitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
// Instead of using the Section we'll just use the shortcut.
- assert(Section->getVariant() == MCSection::SV_MachO &&
+ assert(getContext().getObjectFileType() == MCContext::IsMachO &&
".zerofill is a Mach-O specific directive");
// This is a mach-o specific directive and section.
diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp
index e142ac1..8500fd1 100644
--- a/llvm/lib/MC/MCAssembler.cpp
+++ b/llvm/lib/MC/MCAssembler.cpp
@@ -106,7 +106,6 @@ void MCAssembler::reset() {
bool MCAssembler::registerSection(MCSection &Section) {
if (Section.isRegistered())
return false;
- assert(Section.curFragList()->Head && "allocInitialFragment not called");
Sections.push_back(&Section);
Section.setIsRegistered(true);
return true;
diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp
index 12b3fba..39bf628 100644
--- a/llvm/lib/MC/MCContext.cpp
+++ b/llvm/lib/MC/MCContext.cpp
@@ -200,16 +200,6 @@ MCInst *MCContext::createMCInst() {
return new (MCInstAllocator.Allocate()) MCInst;
}
-// Allocate the initial MCFragment for the begin symbol.
-MCFragment *MCContext::allocInitialFragment(MCSection &Sec) {
- assert(!Sec.curFragList()->Head);
- auto *F = allocFragment<MCFragment>();
- F->setParent(&Sec);
- Sec.curFragList()->Head = F;
- Sec.curFragList()->Tail = F;
- return F;
-}
-
//===----------------------------------------------------------------------===//
// Symbol Manipulation
//===----------------------------------------------------------------------===//
@@ -443,17 +433,19 @@ MCSymbol *MCContext::getDirectionalLocalSymbol(unsigned LocalLabelVal,
return getOrCreateDirectionalLocalSymbol(LocalLabelVal, Instance);
}
+// Create a section symbol, with a distinct one for each section of the
+// same name. The first symbol is used for assembly code references.
template <typename Symbol>
Symbol *MCContext::getOrCreateSectionSymbol(StringRef Section) {
Symbol *R;
auto &SymEntry = getSymbolTableEntry(Section);
MCSymbol *Sym = SymEntry.second.Symbol;
- // A section symbol can not redefine regular symbols. There may be multiple
- // sections with the same name, in which case the first such section wins.
if (Sym && Sym->isDefined() &&
(!Sym->isInSection() || Sym->getSection().getBeginSymbol() != Sym))
reportError(SMLoc(), "invalid symbol redefinition");
- if (Sym && Sym->isUndefined()) {
+ // Use the symbol's index to track if it has been used as a section symbol.
+ // Set to -1 to catch potential bugs if misused as a symbol index.
+ if (Sym && Sym->getIndex() != -1u) {
R = cast<Symbol>(Sym);
} else {
SymEntry.second.Used = true;
@@ -461,6 +453,8 @@ Symbol *MCContext::getOrCreateSectionSymbol(StringRef Section) {
if (!Sym)
SymEntry.second.Symbol = R;
}
+ // Mark as section symbol.
+ R->setIndex(-1u);
return R;
}
@@ -568,7 +562,6 @@ MCSectionMachO *MCContext::getMachOSection(StringRef Segment, StringRef Section,
MCSectionMachO(Segment, Name.substr(Name.size() - Section.size()),
TypeAndAttributes, Reserved2, Kind, Begin);
R.first->second = Ret;
- allocInitialFragment(*Ret);
return Ret;
}
@@ -579,15 +572,8 @@ MCSectionELF *MCContext::createELFSectionImpl(StringRef Section, unsigned Type,
bool Comdat, unsigned UniqueID,
const MCSymbolELF *LinkedToSym) {
auto *R = getOrCreateSectionSymbol<MCSymbolELF>(Section);
- R->setBinding(ELF::STB_LOCAL);
- R->setType(ELF::STT_SECTION);
-
- auto *Ret = new (ELFAllocator.Allocate()) MCSectionELF(
+ return new (ELFAllocator.Allocate()) MCSectionELF(
Section, Type, Flags, EntrySize, Group, Comdat, UniqueID, R, LinkedToSym);
-
- auto *F = allocInitialFragment(*Ret);
- R->setFragment(F);
- return Ret;
}
MCSectionELF *
@@ -743,7 +729,6 @@ MCSectionGOFF *MCContext::getGOFFSection(SectionKind Kind, StringRef Name,
MCSectionGOFF(CachedName, Kind, IsVirtual, Attributes,
static_cast<MCSectionGOFF *>(Parent));
Iter->second = GOFFSection;
- allocInitialFragment(*GOFFSection);
return GOFFSection;
}
@@ -782,8 +767,8 @@ MCSectionCOFF *MCContext::getCOFFSection(StringRef Section,
if (Selection != COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE &&
COMDATSymbol->isDefined() &&
(!COMDATSymbol->isInSection() ||
- cast<MCSectionCOFF>(COMDATSymbol->getSection()).getCOMDATSymbol() !=
- COMDATSymbol))
+ static_cast<const MCSectionCOFF &>(COMDATSymbol->getSection())
+ .getCOMDATSymbol() != COMDATSymbol))
reportError(SMLoc(), "invalid symbol redefinition");
}
@@ -798,8 +783,7 @@ MCSectionCOFF *MCContext::getCOFFSection(StringRef Section,
MCSectionCOFF *Result = new (COFFAllocator.Allocate()) MCSectionCOFF(
CachedName, Characteristics, COMDATSymbol, Selection, UniqueID, Begin);
Iter->second = Result;
- auto *F = allocInitialFragment(*Result);
- Begin->setFragment(F);
+ Begin->setFragment(&Result->getDummyFragment());
return Result;
}
@@ -870,8 +854,6 @@ MCSectionWasm *MCContext::getWasmSection(const Twine &Section, SectionKind Kind,
MCSectionWasm(CachedName, Kind, Flags, GroupSym, UniqueID, Begin);
Entry.second = Result;
- auto *F = allocInitialFragment(*Result);
- Begin->setFragment(F);
return Result;
}
@@ -927,24 +909,11 @@ MCSectionXCOFF *MCContext::getXCOFFSection(
MultiSymbolsAllowed);
Entry.second = Result;
-
- auto *F = allocInitialFragment(*Result);
-
- // We might miss calculating the symbols difference as absolute value before
- // adding fixups when symbol_A without the fragment set is the csect itself
- // and symbol_B is in it.
- // TODO: Currently we only set the fragment for XMC_PR csects and DWARF
- // sections because we don't have other cases that hit this problem yet.
- if (IsDwarfSec || CsectProp->MappingClass == XCOFF::XMC_PR)
- QualName->setFragment(F);
-
return Result;
}
MCSectionSPIRV *MCContext::getSPIRVSection() {
MCSectionSPIRV *Result = new (SPIRVAllocator.Allocate()) MCSectionSPIRV();
-
- allocInitialFragment(*Result);
return Result;
}
@@ -964,7 +933,6 @@ MCSectionDXContainer *MCContext::getDXContainerSection(StringRef Section,
new (DXCAllocator.Allocate()) MCSectionDXContainer(Name, K, nullptr);
// The first fragment will store the header
- allocInitialFragment(*MapIt->second);
return MapIt->second;
}
diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp
index b8cbaea5..38744a0 100644
--- a/llvm/lib/MC/MCELFStreamer.cpp
+++ b/llvm/lib/MC/MCELFStreamer.cpp
@@ -89,7 +89,9 @@ void MCELFStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
getWriter().markGnuAbi();
MCObjectStreamer::changeSection(Section, Subsection);
- Asm.registerSymbol(*Section->getBeginSymbol());
+ auto *Sym = static_cast<MCSymbolELF *>(Section->getBeginSymbol());
+ Sym->setBinding(ELF::STB_LOCAL);
+ Sym->setType(ELF::STT_SECTION);
}
void MCELFStreamer::emitWeakReference(MCSymbol *Alias, const MCSymbol *Target) {
diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp
index dbb2fd1..c24c82d 100644
--- a/llvm/lib/MC/MCExpr.cpp
+++ b/llvm/lib/MC/MCExpr.cpp
@@ -346,17 +346,16 @@ static void attemptToFoldSymbolOffsetDifference(const MCAssembler *Asm,
Displacement *= -1;
}
- // Track whether B is before a relaxable instruction and whether A is after
- // a relaxable instruction. If SA and SB are separated by a linker-relaxable
- // instruction, the difference cannot be resolved as it may be changed by
- // the linker.
+ // Track whether B is before a relaxable instruction/alignment and whether A
+ // is after a relaxable instruction/alignment. If SA and SB are separated by
+ // a linker-relaxable instruction/alignment, the difference cannot be
+ // resolved as it may be changed by the linker.
bool BBeforeRelax = false, AAfterRelax = false;
for (auto F = FB; F; F = F->getNext()) {
- auto DF = F->getKind() == MCFragment::FT_Data ? F : nullptr;
- if (DF && DF->isLinkerRelaxable()) {
- if (&*F != FB || SBOffset != DF->getContents().size())
+ if (F && F->isLinkerRelaxable()) {
+ if (&*F != FB || SBOffset != F->getSize())
BBeforeRelax = true;
- if (&*F != FA || SAOffset == DF->getContents().size())
+ if (&*F != FA || SAOffset == F->getSize())
AAfterRelax = true;
if (BBeforeRelax && AAfterRelax)
return;
@@ -370,17 +369,15 @@ static void attemptToFoldSymbolOffsetDifference(const MCAssembler *Asm,
}
int64_t Num;
- if (DF) {
- Displacement += DF->getContents().size();
- } else if (F->getKind() == MCFragment::FT_Relaxable &&
+ if (F->getKind() == MCFragment::FT_Data) {
+ Displacement += F->getFixedSize();
+ } else if ((F->getKind() == MCFragment::FT_Relaxable ||
+ F->getKind() == MCFragment::FT_Align) &&
Asm->hasFinalLayout()) {
// Before finishLayout, a relaxable fragment's size is indeterminate.
// After layout, during relocation generation, it can be treated as a
// data fragment.
Displacement += F->getSize();
- } else if (F->getKind() == MCFragment::FT_Align && Layout &&
- F->isLinkerRelaxable()) {
- Displacement += Asm->computeFragmentSize(*F);
} else if (auto *FF = dyn_cast<MCFillFragment>(F);
FF && FF->getNumValues().evaluateAsAbsolute(Num)) {
Displacement += Num * FF->getValueSize();
diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp
index 3c395e5..6cbdf74 100644
--- a/llvm/lib/MC/MCFragment.cpp
+++ b/llvm/lib/MC/MCFragment.cpp
@@ -35,7 +35,7 @@ MCFragment::MCFragment(FragmentType Kind, bool HasInstructions)
}
const MCSymbol *MCFragment::getAtom() const {
- return cast<MCSectionMachO>(Parent)->getAtom(LayoutOrder);
+ return static_cast<const MCSectionMachO *>(Parent)->getAtom(LayoutOrder);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp
index 8c3332c..a214513 100644
--- a/llvm/lib/MC/MCMachOStreamer.cpp
+++ b/llvm/lib/MC/MCMachOStreamer.cpp
@@ -140,6 +140,8 @@ void MCMachOStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
MCSymbol *Label = getContext().createLinkerPrivateTempSymbol();
Section->setBeginSymbol(Label);
HasSectionLabel[Section] = true;
+ if (!Label->isInSection())
+ emitLabel(Label);
}
}
@@ -441,13 +443,13 @@ void MCMachOStreamer::finishImpl() {
// Set the fragment atom associations by tracking the last seen atom defining
// symbol.
for (MCSection &Sec : getAssembler()) {
- cast<MCSectionMachO>(Sec).allocAtoms();
+ static_cast<MCSectionMachO &>(Sec).allocAtoms();
const MCSymbol *CurrentAtom = nullptr;
size_t I = 0;
for (MCFragment &Frag : Sec) {
if (const MCSymbol *Symbol = DefiningSymbolMap.lookup(&Frag))
CurrentAtom = Symbol;
- cast<MCSectionMachO>(Sec).setAtom(I++, CurrentAtom);
+ static_cast<MCSectionMachO &>(Sec).setAtom(I++, CurrentAtom);
}
}
@@ -482,7 +484,8 @@ void MCMachOStreamer::finalizeCGProfile() {
// For each entry, reserve space for 2 32-bit indices and a 64-bit count.
size_t SectionBytes =
W.getCGProfile().size() * (2 * sizeof(uint32_t) + sizeof(uint64_t));
- (*CGProfileSection->begin()).appendContents(SectionBytes, 0);
+ (*CGProfileSection->begin())
+ .setVarContents(std::vector<char>(SectionBytes, 0));
}
MCStreamer *llvm::createMachOStreamer(MCContext &Context,
@@ -518,5 +521,6 @@ void MCMachOStreamer::createAddrSigSection() {
// (instead of emitting a zero-sized section) so these relocations are
// technically valid, even though we don't expect these relocations to
// actually be applied by the linker.
- Frag->appendContents(8, 0);
+ constexpr char zero[8] = {};
+ Frag->setVarContents(zero);
}
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index fcd5cbf..e82393a 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -46,23 +46,83 @@ MCAssembler *MCObjectStreamer::getAssemblerPtr() {
return nullptr;
}
+constexpr size_t FragBlockSize = 16384;
+// Ensure the new fragment can at least store a few bytes.
+constexpr size_t NewFragHeadroom = 8;
+
+static_assert(NewFragHeadroom >= alignof(MCFragment));
+static_assert(FragBlockSize >= sizeof(MCFragment) + NewFragHeadroom);
+
+MCFragment *MCObjectStreamer::allocFragSpace(size_t Headroom) {
+ auto Size = std::max(FragBlockSize, sizeof(MCFragment) + Headroom);
+ FragSpace = Size - sizeof(MCFragment);
+ auto Chunk = std::unique_ptr<char[]>(new char[Size]);
+ auto *F = reinterpret_cast<MCFragment *>(Chunk.get());
+ FragStorage.push_back(std::move(Chunk));
+ return F;
+}
+
void MCObjectStreamer::newFragment() {
- addFragment(getContext().allocFragment<MCFragment>());
+ MCFragment *F;
+ if (LLVM_LIKELY(sizeof(MCFragment) + NewFragHeadroom <= FragSpace)) {
+ auto End = reinterpret_cast<size_t>(getCurFragEnd());
+ F = reinterpret_cast<MCFragment *>(
+ alignToPowerOf2(End, alignof(MCFragment)));
+ FragSpace -= size_t(F) - End + sizeof(MCFragment);
+ } else {
+ F = allocFragSpace(0);
+ }
+ new (F) MCFragment();
+ addFragment(F);
}
-void MCObjectStreamer::insert(MCFragment *F) {
- assert(F->getKind() != MCFragment::FT_Data &&
+void MCObjectStreamer::ensureHeadroom(size_t Headroom) {
+ if (Headroom <= FragSpace)
+ return;
+ auto *F = allocFragSpace(Headroom);
+ new (F) MCFragment();
+ addFragment(F);
+}
+
+void MCObjectStreamer::insert(MCFragment *Frag) {
+ assert(Frag->getKind() != MCFragment::FT_Data &&
"F should have a variable-size tail");
+ // Frag is not connected to FragSpace. Before modifying CurFrag with
+ // addFragment(Frag), allocate an empty fragment to maintain FragSpace
+ // connectivity, potentially reusing CurFrag's associated space.
+ MCFragment *F;
+ if (LLVM_LIKELY(sizeof(MCFragment) + NewFragHeadroom <= FragSpace)) {
+ auto End = reinterpret_cast<size_t>(getCurFragEnd());
+ F = reinterpret_cast<MCFragment *>(
+ alignToPowerOf2(End, alignof(MCFragment)));
+ FragSpace -= size_t(F) - End + sizeof(MCFragment);
+ } else {
+ F = allocFragSpace(0);
+ }
+ new (F) MCFragment();
+
+ addFragment(Frag);
addFragment(F);
- newFragment();
+}
+
+void MCObjectStreamer::appendContents(ArrayRef<char> Contents) {
+ ensureHeadroom(Contents.size());
+ assert(FragSpace >= Contents.size());
+ llvm::copy(Contents, getCurFragEnd());
+ CurFrag->FixedSize += Contents.size();
+ FragSpace -= Contents.size();
}
void MCObjectStreamer::appendContents(size_t Num, char Elt) {
- CurFrag->appendContents(Num, Elt);
+ ensureHeadroom(Num);
+ MutableArrayRef<char> Data(getCurFragEnd(), Num);
+ llvm::fill(Data, Elt);
+ CurFrag->FixedSize += Num;
+ FragSpace -= Num;
}
void MCObjectStreamer::addFixup(const MCExpr *Value, MCFixupKind Kind) {
- CurFrag->addFixup(MCFixup::create(CurFrag->getFixedSize(), Value, Kind));
+ CurFrag->addFixup(MCFixup::create(getCurFragSize(), Value, Kind));
}
// As a compile-time optimization, avoid allocating and evaluating an MCExpr
@@ -111,6 +171,8 @@ void MCObjectStreamer::reset() {
}
EmitEHFrame = true;
EmitDebugFrame = false;
+ FragStorage.clear();
+ FragSpace = 0;
MCStreamer::reset();
}
@@ -139,7 +201,6 @@ void MCObjectStreamer::emitCFISections(bool EH, bool Debug, bool SFrame) {
void MCObjectStreamer::emitValueImpl(const MCExpr *Value, unsigned Size,
SMLoc Loc) {
MCStreamer::emitValueImpl(Value, Size, Loc);
- MCFragment *DF = getCurrentFragment();
MCDwarfLineEntry::make(this, getCurrentSectionOnly());
@@ -154,9 +215,9 @@ void MCObjectStreamer::emitValueImpl(const MCExpr *Value, unsigned Size,
emitIntValue(AbsValue, Size);
return;
}
- DF->addFixup(MCFixup::create(DF->getContents().size(), Value,
- MCFixup::getDataKindForSize(Size)));
- DF->appendContents(Size, 0);
+ ensureHeadroom(Size);
+ addFixup(Value, MCFixup::getDataKindForSize(Size));
+ appendContents(Size, 0);
}
MCSymbol *MCObjectStreamer::emitCFILabel() {
@@ -185,12 +246,12 @@ void MCObjectStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) {
getAssembler().registerSymbol(*Symbol);
- // If there is a current fragment, mark the symbol as pointing into it.
- // Otherwise queue the label and set its fragment pointer when we emit the
- // next fragment.
- MCFragment *F = getCurrentFragment();
+ // Set the fragment and offset. This function might be called by
+ // changeSection, when the section stack top hasn't been changed to the new
+ // section.
+ MCFragment *F = CurFrag;
Symbol->setFragment(F);
- Symbol->setOffset(F->getContents().size());
+ Symbol->setOffset(F->getFixedSize());
emitPendingAssignments(Symbol);
}
@@ -247,6 +308,30 @@ void MCObjectStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
assert(Section && "Cannot switch to a null section!");
getContext().clearDwarfLocSeen();
+ // Register the section and create an initial fragment for subsection 0
+ // if `Subsection` is non-zero.
+ bool NewSec = getAssembler().registerSection(*Section);
+ MCFragment *F0 = nullptr;
+ if (NewSec && Subsection) {
+ changeSection(Section, 0);
+ F0 = CurFrag;
+ }
+
+ // To maintain connectivity between CurFrag and FragSpace when CurFrag is
+ // modified, allocate an empty fragment and append it to the fragment list.
+ // (Subsections[I].second.Tail is not connected to FragSpace.)
+ MCFragment *F;
+ if (LLVM_LIKELY(sizeof(MCFragment) + NewFragHeadroom <= FragSpace)) {
+ auto End = reinterpret_cast<size_t>(getCurFragEnd());
+ F = reinterpret_cast<MCFragment *>(
+ alignToPowerOf2(End, alignof(MCFragment)));
+ FragSpace -= size_t(F) - End + sizeof(MCFragment);
+ } else {
+ F = allocFragSpace(0);
+ }
+ new (F) MCFragment();
+ F->setParent(Section);
+
auto &Subsections = Section->Subsections;
size_t I = 0, E = Subsections.size();
while (I != E && Subsections[I].first < Subsection)
@@ -254,20 +339,24 @@ void MCObjectStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
// If the subsection number is not in the sorted Subsections list, create a
// new fragment list.
if (I == E || Subsections[I].first != Subsection) {
- auto *F = getContext().allocFragment<MCFragment>();
- F->setParent(Section);
Subsections.insert(Subsections.begin() + I,
{Subsection, MCSection::FragList{F, F}});
+ Section->CurFragList = &Subsections[I].second;
+ CurFrag = F;
+ } else {
+ Section->CurFragList = &Subsections[I].second;
+ CurFrag = Subsections[I].second.Tail;
+ // Ensure CurFrag is associated with FragSpace.
+ addFragment(F);
}
- Section->CurFragList = &Subsections[I].second;
- CurFrag = Section->CurFragList->Tail;
-
- getAssembler().registerSection(*Section);
-}
-void MCObjectStreamer::switchSectionNoPrint(MCSection *Section) {
- MCStreamer::switchSectionNoPrint(Section);
- changeSection(Section, 0);
+ // Define the section symbol at subsection 0's initial fragment if required.
+ if (!NewSec)
+ return;
+ if (auto *Sym = Section->getBeginSymbol()) {
+ Sym->setFragment(Subsection ? F0 : CurFrag);
+ getAssembler().registerSymbol(*Sym);
+ }
}
void MCObjectStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
@@ -330,31 +419,37 @@ void MCObjectStreamer::emitInstToData(const MCInst &Inst,
MCFragment *F = getCurrentFragment();
// Append the instruction to the data fragment.
- size_t FixupStartIndex = F->getFixups().size();
- size_t CodeOffset = F->getContents().size();
+ size_t CodeOffset = getCurFragSize();
+ SmallString<16> Content;
SmallVector<MCFixup, 1> Fixups;
- getAssembler().getEmitter().encodeInstruction(
- Inst, F->getContentsForAppending(), Fixups, STI);
- F->doneAppending();
- if (!Fixups.empty())
- F->appendFixups(Fixups);
+ getAssembler().getEmitter().encodeInstruction(Inst, Content, Fixups, STI);
+ appendContents(Content);
+ if (CurFrag != F) {
+ F = CurFrag;
+ CodeOffset = 0;
+ }
F->setHasInstructions(STI);
+ if (Fixups.empty())
+ return;
bool MarkedLinkerRelaxable = false;
- for (auto &Fixup : MutableArrayRef(F->getFixups()).slice(FixupStartIndex)) {
+ for (auto &Fixup : Fixups) {
Fixup.setOffset(Fixup.getOffset() + CodeOffset);
- if (!Fixup.isLinkerRelaxable())
+ if (!Fixup.isLinkerRelaxable() || MarkedLinkerRelaxable)
continue;
- F->setLinkerRelaxable();
+ MarkedLinkerRelaxable = true;
+ // Set the fragment's order within the subsection for use by
+ // MCAssembler::relaxAlign.
+ auto *Sec = F->getParent();
+ if (!Sec->isLinkerRelaxable())
+ Sec->setLinkerRelaxable();
// Do not add data after a linker-relaxable instruction. The difference
// between a new label and a label at or before the linker-relaxable
// instruction cannot be resolved at assemble-time.
- if (!MarkedLinkerRelaxable) {
- MarkedLinkerRelaxable = true;
- getCurrentSectionOnly()->setLinkerRelaxable();
- newFragment();
- }
+ F->setLinkerRelaxable();
+ newFragment();
}
+ F->appendFixups(Fixups);
}
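Since encodeInstruction now writes into a local buffer, the fixups it records are offsets within the instruction, and the loop above rebases them by CodeOffset to make them fragment-relative (CodeOffset is reset to 0 when appendContents rolls over into a fresh fragment). A minimal sketch of the rebasing step, with Fixup reduced to a plain struct rather than the real MCFixup:

    #include <cstdint>
    #include <string>
    #include <vector>

    struct Fixup {
      uint32_t Offset; // initially: offset within the encoded instruction
    };

    // Append one encoded instruction to a fragment's contents and rebase its
    // fixups so their offsets are relative to the fragment start.
    static void appendInstruction(std::string &FragContents,
                                  const std::string &Encoded,
                                  std::vector<Fixup> &Fixups) {
      uint32_t CodeOffset = FragContents.size(); // where the instruction lands
      FragContents += Encoded;
      for (Fixup &F : Fixups)
        F.Offset += CodeOffset; // instruction-relative -> fragment-relative
    }

    int main() {
      std::string Frag("\x90\x90", 2);             // two bytes already emitted
      std::string Call("\xe8\x00\x00\x00\x00", 5); // call, 4-byte immediate
      std::vector<Fixup> Fixups{{1}};              // fixup at instruction byte 1
      appendInstruction(Frag, Call, Fixups);
      return Fixups[0].Offset == 3 ? 0 : 1;        // now fragment byte 3
    }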
void MCObjectStreamer::emitInstToFragment(const MCInst &Inst,
@@ -526,8 +621,7 @@ void MCObjectStreamer::emitCVFileChecksumOffsetDirective(unsigned FileNo) {
void MCObjectStreamer::emitBytes(StringRef Data) {
MCDwarfLineEntry::make(this, getCurrentSectionOnly());
- MCFragment *DF = getCurrentFragment();
- DF->appendContents(ArrayRef(Data.data(), Data.size()));
+ appendContents(ArrayRef(Data.data(), Data.size()));
}
void MCObjectStreamer::emitValueToAlignment(Align Alignment, int64_t Fill,
diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp
index eda5e8c..9f64a98 100644
--- a/llvm/lib/MC/MCParser/AsmParser.cpp
+++ b/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -3413,7 +3413,7 @@ bool AsmParser::parseDirectiveAlign(bool IsPow2, uint8_t ValueSize) {
// Check whether we should use optimal code alignment for this .align
// directive.
- if (Section->useCodeAlign() && !HasFillExpr) {
+ if (MAI.useCodeAlign(*Section) && !HasFillExpr) {
getStreamer().emitCodeAlignment(
Align(Alignment), &getTargetParser().getSTI(), MaxBytesToFill);
} else {
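The .align path now asks MCAsmInfo, not the section object, whether code alignment (nop padding) should be used. The per-format answers can be read off the MCSection*::useCodeAlign overrides deleted later in this patch; two of them, restated as free functions for illustration (where the real replacements live, presumably the per-format MCAsmInfo subclasses, is an assumption here):

    #include "llvm/BinaryFormat/ELF.h"
    #include "llvm/BinaryFormat/MachO.h"
    #include "llvm/MC/MCSectionELF.h"
    #include "llvm/MC/MCSectionMachO.h"

    using namespace llvm;

    // ELF: use code alignment for executable sections.
    static bool elfUseCodeAlign(const MCSectionELF &Sec) {
      return Sec.getFlags() & ELF::SHF_EXECINSTR;
    }

    // Mach-O: use code alignment for pure-instruction sections.
    static bool machoUseCodeAlign(const MCSectionMachO &Sec) {
      return Sec.hasAttribute(MachO::S_ATTR_PURE_INSTRUCTIONS);
    }

COFF and XCOFF reduce to isText(), and Wasm always answers false, per the deleted overrides below.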
diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
index c7c3df3..2e251cc 100644
--- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
@@ -644,8 +644,8 @@ EndStmt:
}
if (UseLastGroup) {
- if (const MCSectionELF *Section =
- cast_or_null<MCSectionELF>(getStreamer().getCurrentSectionOnly()))
+ if (auto *Section = static_cast<const MCSectionELF *>(
+ getStreamer().getCurrentSectionOnly()))
if (const MCSymbol *Group = Section->getGroup()) {
GroupName = Group->getName();
IsComdat = Section->isComdat();
diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp
index f4684e6..780289e 100644
--- a/llvm/lib/MC/MCParser/MasmParser.cpp
+++ b/llvm/lib/MC/MCParser/MasmParser.cpp
@@ -4228,8 +4228,7 @@ bool MasmParser::emitAlignTo(int64_t Alignment) {
// Check whether we should use optimal code alignment for this align
// directive.
const MCSection *Section = getStreamer().getCurrentSectionOnly();
- assert(Section && "must have section to emit alignment");
- if (Section->useCodeAlign()) {
+ if (MAI.useCodeAlign(*Section)) {
getStreamer().emitCodeAlignment(Align(Alignment),
&getTargetParser().getSTI(),
/*MaxBytesToEmit=*/0);
diff --git a/llvm/lib/MC/MCParser/WasmAsmParser.cpp b/llvm/lib/MC/MCParser/WasmAsmParser.cpp
index 1f824b8..d97f4f5 100644
--- a/llvm/lib/MC/MCParser/WasmAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/WasmAsmParser.cpp
@@ -252,7 +252,7 @@ public:
if (TypeName == "function") {
WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
auto *Current =
- cast<MCSectionWasm>(getStreamer().getCurrentSectionOnly());
+ static_cast<MCSectionWasm *>(getStreamer().getCurrentSectionOnly());
if (Current->getGroup())
WasmSym->setComdat(true);
} else if (TypeName == "global")
diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp
index 023f7f2..4f28267 100644
--- a/llvm/lib/MC/MCSection.cpp
+++ b/llvm/lib/MC/MCSection.cpp
@@ -18,12 +18,10 @@
using namespace llvm;
-MCSection::MCSection(SectionVariant V, StringRef Name, bool IsText, bool IsBss,
- MCSymbol *Begin)
+MCSection::MCSection(StringRef Name, bool IsText, bool IsBss, MCSymbol *Begin)
: Begin(Begin), HasInstructions(false), IsRegistered(false), IsText(IsText),
- IsBss(IsBss), LinkerRelaxable(false), Name(Name), Variant(V) {
- // The initial subsection number is 0. Create a fragment list.
- CurFragList = &Subsections.emplace_back(0u, FragList{}).second;
+ IsBss(IsBss), LinkerRelaxable(false), Name(Name) {
+ DummyFragment.setParent(this);
}
MCSymbol *MCSection::getEndSymbol(MCContext &Ctx) {
diff --git a/llvm/lib/MC/MCSectionCOFF.cpp b/llvm/lib/MC/MCSectionCOFF.cpp
deleted file mode 100644
index 5bf1473..0000000
--- a/llvm/lib/MC/MCSectionCOFF.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-//===- lib/MC/MCSectionCOFF.cpp - COFF Code Section Representation --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCSectionCOFF.h"
-#include "llvm/BinaryFormat/COFF.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cassert>
-
-using namespace llvm;
-
-// shouldOmitSectionDirective - Decides whether a '.section' directive
-// should be printed before the section name
-bool MCSectionCOFF::shouldOmitSectionDirective(StringRef Name,
- const MCAsmInfo &MAI) const {
- if (COMDATSymbol || isUnique())
- return false;
-
- // FIXME: Does .section .bss/.data/.text work everywhere??
- if (Name == ".text" || Name == ".data" || Name == ".bss")
- return true;
-
- return false;
-}
-
-void MCSectionCOFF::setSelection(int Selection) const {
- assert(Selection != 0 && "invalid COMDAT selection type");
- this->Selection = Selection;
- Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
-}
-
-void MCSectionCOFF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
- raw_ostream &OS,
- uint32_t Subsection) const {
- // standard sections don't require the '.section'
- if (shouldOmitSectionDirective(getName(), MAI)) {
- OS << '\t' << getName() << '\n';
- return;
- }
-
- OS << "\t.section\t" << getName() << ",\"";
- if (getCharacteristics() & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA)
- OS << 'd';
- if (getCharacteristics() & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA)
- OS << 'b';
- if (getCharacteristics() & COFF::IMAGE_SCN_MEM_EXECUTE)
- OS << 'x';
- if (getCharacteristics() & COFF::IMAGE_SCN_MEM_WRITE)
- OS << 'w';
- else if (getCharacteristics() & COFF::IMAGE_SCN_MEM_READ)
- OS << 'r';
- else
- OS << 'y';
- if (getCharacteristics() & COFF::IMAGE_SCN_LNK_REMOVE)
- OS << 'n';
- if (getCharacteristics() & COFF::IMAGE_SCN_MEM_SHARED)
- OS << 's';
- if ((getCharacteristics() & COFF::IMAGE_SCN_MEM_DISCARDABLE) &&
- !isImplicitlyDiscardable(getName()))
- OS << 'D';
- if (getCharacteristics() & COFF::IMAGE_SCN_LNK_INFO)
- OS << 'i';
- OS << '"';
-
- // unique should be tail of .section directive.
- if (isUnique() && !COMDATSymbol)
- OS << ",unique," << UniqueID;
-
- if (getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT) {
- if (COMDATSymbol)
- OS << ",";
- else
- OS << "\n\t.linkonce\t";
- switch (Selection) {
- case COFF::IMAGE_COMDAT_SELECT_NODUPLICATES:
- OS << "one_only";
- break;
- case COFF::IMAGE_COMDAT_SELECT_ANY:
- OS << "discard";
- break;
- case COFF::IMAGE_COMDAT_SELECT_SAME_SIZE:
- OS << "same_size";
- break;
- case COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH:
- OS << "same_contents";
- break;
- case COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE:
- OS << "associative";
- break;
- case COFF::IMAGE_COMDAT_SELECT_LARGEST:
- OS << "largest";
- break;
- case COFF::IMAGE_COMDAT_SELECT_NEWEST:
- OS << "newest";
- break;
- default:
- assert(false && "unsupported COFF selection type");
- break;
- }
- if (COMDATSymbol) {
- OS << ",";
- COMDATSymbol->print(OS, &MAI);
- }
- }
-
- if (isUnique() && COMDATSymbol)
- OS << ",unique," << UniqueID;
-
- OS << '\n';
-}
-
-bool MCSectionCOFF::useCodeAlign() const { return isText(); }
diff --git a/llvm/lib/MC/MCSectionDXContainer.cpp b/llvm/lib/MC/MCSectionDXContainer.cpp
deleted file mode 100644
index 7eee59d..0000000
--- a/llvm/lib/MC/MCSectionDXContainer.cpp
+++ /dev/null
@@ -1,15 +0,0 @@
-//===- lib/MC/MCSectionDXContainer.cpp - DXContainer Section --------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCSectionDXContainer.h"
-
-using namespace llvm;
-
-void MCSectionDXContainer::printSwitchToSection(const MCAsmInfo &,
- const Triple &, raw_ostream &,
- uint32_t) const {}
diff --git a/llvm/lib/MC/MCSectionELF.cpp b/llvm/lib/MC/MCSectionELF.cpp
deleted file mode 100644
index ef33f9c..0000000
--- a/llvm/lib/MC/MCSectionELF.cpp
+++ /dev/null
@@ -1,217 +0,0 @@
-//===- lib/MC/MCSectionELF.cpp - ELF Code Section Representation ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCSectionELF.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/TargetParser/Triple.h"
-#include <cassert>
-
-using namespace llvm;
-
-// Decides whether a '.section' directive
-// should be printed before the section name.
-bool MCSectionELF::shouldOmitSectionDirective(StringRef Name,
- const MCAsmInfo &MAI) const {
- if (isUnique())
- return false;
-
- return MAI.shouldOmitSectionDirective(Name);
-}
-
-static void printName(raw_ostream &OS, StringRef Name) {
- if (Name.find_first_not_of("0123456789_."
- "abcdefghijklmnopqrstuvwxyz"
- "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == Name.npos) {
- OS << Name;
- return;
- }
- OS << '"';
- for (const char *B = Name.begin(), *E = Name.end(); B < E; ++B) {
- if (*B == '"') // Unquoted "
- OS << "\\\"";
- else if (*B != '\\') // Neither " or backslash
- OS << *B;
- else if (B + 1 == E) // Trailing backslash
- OS << "\\\\";
- else {
- OS << B[0] << B[1]; // Quoted character
- ++B;
- }
- }
- OS << '"';
-}
-
-void MCSectionELF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
- raw_ostream &OS,
- uint32_t Subsection) const {
- if (shouldOmitSectionDirective(getName(), MAI)) {
- OS << '\t' << getName();
- if (Subsection)
- OS << '\t' << Subsection;
- OS << '\n';
- return;
- }
-
- OS << "\t.section\t";
- printName(OS, getName());
-
- // Handle the weird solaris syntax if desired.
- if (MAI.usesSunStyleELFSectionSwitchSyntax() &&
- !(Flags & ELF::SHF_MERGE)) {
- if (Flags & ELF::SHF_ALLOC)
- OS << ",#alloc";
- if (Flags & ELF::SHF_EXECINSTR)
- OS << ",#execinstr";
- if (Flags & ELF::SHF_WRITE)
- OS << ",#write";
- if (Flags & ELF::SHF_EXCLUDE)
- OS << ",#exclude";
- if (Flags & ELF::SHF_TLS)
- OS << ",#tls";
- OS << '\n';
- return;
- }
-
- OS << ",\"";
- if (Flags & ELF::SHF_ALLOC)
- OS << 'a';
- if (Flags & ELF::SHF_EXCLUDE)
- OS << 'e';
- if (Flags & ELF::SHF_EXECINSTR)
- OS << 'x';
- if (Flags & ELF::SHF_WRITE)
- OS << 'w';
- if (Flags & ELF::SHF_MERGE)
- OS << 'M';
- if (Flags & ELF::SHF_STRINGS)
- OS << 'S';
- if (Flags & ELF::SHF_TLS)
- OS << 'T';
- if (Flags & ELF::SHF_LINK_ORDER)
- OS << 'o';
- if (Flags & ELF::SHF_GROUP)
- OS << 'G';
- if (Flags & ELF::SHF_GNU_RETAIN)
- OS << 'R';
-
- // If there are os-specific flags, print them.
- if (T.isOSSolaris())
- if (Flags & ELF::SHF_SUNW_NODISCARD)
- OS << 'R';
-
- // If there are target-specific flags, print them.
- Triple::ArchType Arch = T.getArch();
- if (Arch == Triple::xcore) {
- if (Flags & ELF::XCORE_SHF_CP_SECTION)
- OS << 'c';
- if (Flags & ELF::XCORE_SHF_DP_SECTION)
- OS << 'd';
- } else if (T.isARM() || T.isThumb()) {
- if (Flags & ELF::SHF_ARM_PURECODE)
- OS << 'y';
- } else if (T.isAArch64()) {
- if (Flags & ELF::SHF_AARCH64_PURECODE)
- OS << 'y';
- } else if (Arch == Triple::hexagon) {
- if (Flags & ELF::SHF_HEX_GPREL)
- OS << 's';
- } else if (Arch == Triple::x86_64) {
- if (Flags & ELF::SHF_X86_64_LARGE)
- OS << 'l';
- }
-
- OS << '"';
-
- OS << ',';
-
- // If comment string is '@', e.g. as on ARM - use '%' instead
- if (MAI.getCommentString()[0] == '@')
- OS << '%';
- else
- OS << '@';
-
- if (Type == ELF::SHT_INIT_ARRAY)
- OS << "init_array";
- else if (Type == ELF::SHT_FINI_ARRAY)
- OS << "fini_array";
- else if (Type == ELF::SHT_PREINIT_ARRAY)
- OS << "preinit_array";
- else if (Type == ELF::SHT_NOBITS)
- OS << "nobits";
- else if (Type == ELF::SHT_NOTE)
- OS << "note";
- else if (Type == ELF::SHT_PROGBITS)
- OS << "progbits";
- else if (Type == ELF::SHT_X86_64_UNWIND)
- OS << "unwind";
- else if (Type == ELF::SHT_MIPS_DWARF)
- // Print hex value of the flag while we do not have
- // any standard symbolic representation of the flag.
- OS << "0x7000001e";
- else if (Type == ELF::SHT_LLVM_ODRTAB)
- OS << "llvm_odrtab";
- else if (Type == ELF::SHT_LLVM_LINKER_OPTIONS)
- OS << "llvm_linker_options";
- else if (Type == ELF::SHT_LLVM_CALL_GRAPH_PROFILE)
- OS << "llvm_call_graph_profile";
- else if (Type == ELF::SHT_LLVM_DEPENDENT_LIBRARIES)
- OS << "llvm_dependent_libraries";
- else if (Type == ELF::SHT_LLVM_SYMPART)
- OS << "llvm_sympart";
- else if (Type == ELF::SHT_LLVM_BB_ADDR_MAP)
- OS << "llvm_bb_addr_map";
- else if (Type == ELF::SHT_LLVM_OFFLOADING)
- OS << "llvm_offloading";
- else if (Type == ELF::SHT_LLVM_LTO)
- OS << "llvm_lto";
- else if (Type == ELF::SHT_LLVM_JT_SIZES)
- OS << "llvm_jt_sizes";
- else if (Type == ELF::SHT_LLVM_CFI_JUMP_TABLE)
- OS << "llvm_cfi_jump_table";
- else
- OS << "0x" << Twine::utohexstr(Type);
-
- if (EntrySize) {
- assert((Flags & ELF::SHF_MERGE) || Type == ELF::SHT_LLVM_CFI_JUMP_TABLE);
- OS << "," << EntrySize;
- }
-
- if (Flags & ELF::SHF_LINK_ORDER) {
- OS << ",";
- if (LinkedToSym)
- printName(OS, LinkedToSym->getName());
- else
- OS << '0';
- }
-
- if (Flags & ELF::SHF_GROUP) {
- OS << ",";
- printName(OS, Group.getPointer()->getName());
- if (isComdat())
- OS << ",comdat";
- }
-
- if (isUnique())
- OS << ",unique," << UniqueID;
-
- OS << '\n';
-
- if (Subsection) {
- OS << "\t.subsection\t" << Subsection;
- OS << '\n';
- }
-}
-
-bool MCSectionELF::useCodeAlign() const {
- return getFlags() & ELF::SHF_EXECINSTR;
-}
diff --git a/llvm/lib/MC/MCSectionGOFF.cpp b/llvm/lib/MC/MCSectionGOFF.cpp
deleted file mode 100644
index 8163e5b..0000000
--- a/llvm/lib/MC/MCSectionGOFF.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-//===- MCSectionGOFF.cpp - GOFF Code Section Representation ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCSectionGOFF.h"
-#include "llvm/BinaryFormat/GOFF.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-static void emitCATTR(raw_ostream &OS, StringRef Name, GOFF::ESDRmode Rmode,
- GOFF::ESDAlignment Alignment,
- GOFF::ESDLoadingBehavior LoadBehavior,
- GOFF::ESDExecutable Executable, bool IsReadOnly,
- uint32_t SortKey, uint8_t FillByteValue,
- StringRef PartName) {
- OS << Name << " CATTR ";
- OS << "ALIGN(" << static_cast<unsigned>(Alignment) << "),"
- << "FILL(" << static_cast<unsigned>(FillByteValue) << ")";
- switch (LoadBehavior) {
- case GOFF::ESD_LB_Deferred:
- OS << ",DEFLOAD";
- break;
- case GOFF::ESD_LB_NoLoad:
- OS << ",NOLOAD";
- break;
- default:
- break;
- }
- switch (Executable) {
- case GOFF::ESD_EXE_CODE:
- OS << ",EXECUTABLE";
- break;
- case GOFF::ESD_EXE_DATA:
- OS << ",NOTEXECUTABLE";
- break;
- default:
- break;
- }
- if (IsReadOnly)
- OS << ",READONLY";
- if (Rmode != GOFF::ESD_RMODE_None) {
- OS << ',';
- OS << "RMODE(";
- switch (Rmode) {
- case GOFF::ESD_RMODE_24:
- OS << "24";
- break;
- case GOFF::ESD_RMODE_31:
- OS << "31";
- break;
- case GOFF::ESD_RMODE_64:
- OS << "64";
- break;
- case GOFF::ESD_RMODE_None:
- break;
- }
- OS << ')';
- }
- if (SortKey)
- OS << ",PRIORITY(" << SortKey << ")";
- if (!PartName.empty())
- OS << ",PART(" << PartName << ")";
- OS << '\n';
-}
-
-static void emitXATTR(raw_ostream &OS, StringRef Name,
- GOFF::ESDLinkageType Linkage,
- GOFF::ESDExecutable Executable,
- GOFF::ESDBindingScope BindingScope) {
- OS << Name << " XATTR ";
- OS << "LINKAGE(" << (Linkage == GOFF::ESD_LT_OS ? "OS" : "XPLINK") << "),";
- if (Executable != GOFF::ESD_EXE_Unspecified)
- OS << "REFERENCE(" << (Executable == GOFF::ESD_EXE_CODE ? "CODE" : "DATA")
- << "),";
- if (BindingScope != GOFF::ESD_BSC_Unspecified) {
- OS << "SCOPE(";
- switch (BindingScope) {
- case GOFF::ESD_BSC_Section:
- OS << "SECTION";
- break;
- case GOFF::ESD_BSC_Module:
- OS << "MODULE";
- break;
- case GOFF::ESD_BSC_Library:
- OS << "LIBRARY";
- break;
- case GOFF::ESD_BSC_ImportExport:
- OS << "EXPORT";
- break;
- default:
- break;
- }
- OS << ')';
- }
- OS << '\n';
-}
-
-void MCSectionGOFF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
- raw_ostream &OS,
- uint32_t Subsection) const {
- switch (SymbolType) {
- case GOFF::ESD_ST_SectionDefinition: {
- OS << Name << " CSECT\n";
- Emitted = true;
- break;
- }
- case GOFF::ESD_ST_ElementDefinition: {
- getParent()->printSwitchToSection(MAI, T, OS, Subsection);
- if (!Emitted) {
- emitCATTR(OS, Name, EDAttributes.Rmode, EDAttributes.Alignment,
- EDAttributes.LoadBehavior, GOFF::ESD_EXE_Unspecified,
- EDAttributes.IsReadOnly, 0, EDAttributes.FillByteValue,
- StringRef());
- Emitted = true;
- } else
- OS << Name << " CATTR\n";
- break;
- }
- case GOFF::ESD_ST_PartReference: {
- MCSectionGOFF *ED = getParent();
- ED->getParent()->printSwitchToSection(MAI, T, OS, Subsection);
- if (!Emitted) {
- emitCATTR(OS, ED->getName(), ED->getEDAttributes().Rmode,
- ED->EDAttributes.Alignment, ED->EDAttributes.LoadBehavior,
- PRAttributes.Executable, ED->EDAttributes.IsReadOnly,
- PRAttributes.SortKey, ED->EDAttributes.FillByteValue, Name);
- emitXATTR(OS, Name, PRAttributes.Linkage, PRAttributes.Executable,
- PRAttributes.BindingScope);
- ED->Emitted = true;
- Emitted = true;
- } else
- OS << ED->getName() << " CATTR PART(" << Name << ")\n";
- break;
- }
- default:
- llvm_unreachable("Wrong section type");
- }
-}
\ No newline at end of file
diff --git a/llvm/lib/MC/MCSectionMachO.cpp b/llvm/lib/MC/MCSectionMachO.cpp
index 67453ce..67c8235 100644
--- a/llvm/lib/MC/MCSectionMachO.cpp
+++ b/llvm/lib/MC/MCSectionMachO.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCAsmInfoDarwin.h"
#include "llvm/MC/SectionKind.h"
#include "llvm/Support/raw_ostream.h"
@@ -92,7 +93,7 @@ ENTRY("" /*FIXME*/, S_ATTR_LOC_RELOC)
MCSectionMachO::MCSectionMachO(StringRef Segment, StringRef Section,
unsigned TAA, unsigned reserved2, SectionKind K,
MCSymbol *Begin)
- : MCSection(SV_MachO, Section, K.isText(),
+ : MCSection(Section, K.isText(),
MachO::isVirtualSection(TAA & MachO::SECTION_TYPE), Begin),
TypeAndAttributes(TAA), Reserved2(reserved2) {
assert(Segment.size() <= 16 && Section.size() <= 16 &&
@@ -105,19 +106,20 @@ MCSectionMachO::MCSectionMachO(StringRef Segment, StringRef Section,
}
}
-void MCSectionMachO::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
- raw_ostream &OS,
- uint32_t Subsection) const {
- OS << "\t.section\t" << getSegmentName() << ',' << getName();
+void MCAsmInfoDarwin::printSwitchToSection(const MCSection &Section, uint32_t,
+ const Triple &T,
+ raw_ostream &OS) const {
+ auto &Sec = static_cast<const MCSectionMachO &>(Section);
+ OS << "\t.section\t" << Sec.getSegmentName() << ',' << Sec.getName();
// Get the section type and attributes.
- unsigned TAA = getTypeAndAttributes();
+ unsigned TAA = Sec.getTypeAndAttributes();
if (TAA == 0) {
OS << '\n';
return;
}
- MachO::SectionType SectionType = getType();
+ MachO::SectionType SectionType = Sec.getType();
assert(SectionType <= MachO::LAST_KNOWN_SECTION_TYPE &&
"Invalid SectionType specified!");
@@ -135,8 +137,8 @@ void MCSectionMachO::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
if (SectionAttrs == 0) {
// If we have a S_SYMBOL_STUBS size specified, print it along with 'none' as
// the attribute specifier.
- if (Reserved2 != 0)
- OS << ",none," << Reserved2;
+ if (Sec.Reserved2 != 0)
+ OS << ",none," << Sec.Reserved2;
OS << '\n';
return;
}
@@ -164,15 +166,11 @@ void MCSectionMachO::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
assert(SectionAttrs == 0 && "Unknown section attributes!");
// If we have a S_SYMBOL_STUBS size specified, print it.
- if (Reserved2 != 0)
- OS << ',' << Reserved2;
+ if (Sec.Reserved2 != 0)
+ OS << ',' << Sec.Reserved2;
OS << '\n';
}
-bool MCSectionMachO::useCodeAlign() const {
- return hasAttribute(MachO::S_ATTR_PURE_INSTRUCTIONS);
-}
-
/// ParseSectionSpecifier - Parse the section specifier indicated by "Spec".
/// This is a string that can appear after a .section directive in a mach-o
/// flavored .s file. If successful, this fills in the specified Out
diff --git a/llvm/lib/MC/MCSectionWasm.cpp b/llvm/lib/MC/MCSectionWasm.cpp
deleted file mode 100644
index e25af1c..0000000
--- a/llvm/lib/MC/MCSectionWasm.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-//===- lib/MC/MCSectionWasm.cpp - Wasm Code Section Representation --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCSectionWasm.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCSymbolWasm.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-// Decides whether a '.section' directive
-// should be printed before the section name.
-bool MCSectionWasm::shouldOmitSectionDirective(StringRef Name,
- const MCAsmInfo &MAI) const {
- return MAI.shouldOmitSectionDirective(Name);
-}
-
-static void printName(raw_ostream &OS, StringRef Name) {
- if (Name.find_first_not_of("0123456789_."
- "abcdefghijklmnopqrstuvwxyz"
- "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == Name.npos) {
- OS << Name;
- return;
- }
- OS << '"';
- for (const char *B = Name.begin(), *E = Name.end(); B < E; ++B) {
- if (*B == '"') // Unquoted "
- OS << "\\\"";
- else if (*B != '\\') // Neither " or backslash
- OS << *B;
- else if (B + 1 == E) // Trailing backslash
- OS << "\\\\";
- else {
- OS << B[0] << B[1]; // Quoted character
- ++B;
- }
- }
- OS << '"';
-}
-
-void MCSectionWasm::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
- raw_ostream &OS,
- uint32_t Subsection) const {
-
- if (shouldOmitSectionDirective(getName(), MAI)) {
- OS << '\t' << getName();
- if (Subsection)
- OS << '\t' << Subsection;
- OS << '\n';
- return;
- }
-
- OS << "\t.section\t";
- printName(OS, getName());
- OS << ",\"";
-
- if (IsPassive)
- OS << 'p';
- if (Group)
- OS << 'G';
- if (SegmentFlags & wasm::WASM_SEG_FLAG_STRINGS)
- OS << 'S';
- if (SegmentFlags & wasm::WASM_SEG_FLAG_TLS)
- OS << 'T';
- if (SegmentFlags & wasm::WASM_SEG_FLAG_RETAIN)
- OS << 'R';
-
- OS << '"';
-
- OS << ',';
-
- // If comment string is '@', e.g. as on ARM - use '%' instead
- if (MAI.getCommentString()[0] == '@')
- OS << '%';
- else
- OS << '@';
-
- // TODO: Print section type.
-
- if (Group) {
- OS << ",";
- printName(OS, Group->getName());
- OS << ",comdat";
- }
-
- if (isUnique())
- OS << ",unique," << UniqueID;
-
- OS << '\n';
-
- if (Subsection)
- OS << "\t.subsection\t" << Subsection << '\n';
-}
-
-bool MCSectionWasm::useCodeAlign() const { return false; }
diff --git a/llvm/lib/MC/MCSectionXCOFF.cpp b/llvm/lib/MC/MCSectionXCOFF.cpp
deleted file mode 100644
index 41043b2..0000000
--- a/llvm/lib/MC/MCSectionXCOFF.cpp
+++ /dev/null
@@ -1,134 +0,0 @@
-//===- lib/MC/MCSectionXCOFF.cpp - XCOFF Code Section Representation ------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCSectionXCOFF.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-namespace llvm {
-class MCExpr;
-class Triple;
-} // namespace llvm
-
-using namespace llvm;
-
-MCSectionXCOFF::~MCSectionXCOFF() = default;
-
-void MCSectionXCOFF::printCsectDirective(raw_ostream &OS) const {
- OS << "\t.csect " << QualName->getName() << "," << Log2(getAlign()) << '\n';
-}
-
-void MCSectionXCOFF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
- raw_ostream &OS,
- uint32_t Subsection) const {
- if (getKind().isText()) {
- if (getMappingClass() != XCOFF::XMC_PR)
- report_fatal_error("Unhandled storage-mapping class for .text csect");
-
- printCsectDirective(OS);
- return;
- }
-
- if (getKind().isReadOnly()) {
- if (getMappingClass() != XCOFF::XMC_RO &&
- getMappingClass() != XCOFF::XMC_TD)
- report_fatal_error("Unhandled storage-mapping class for .rodata csect.");
- printCsectDirective(OS);
- return;
- }
-
- if (getKind().isReadOnlyWithRel()) {
- if (getMappingClass() != XCOFF::XMC_RW &&
- getMappingClass() != XCOFF::XMC_RO &&
- getMappingClass() != XCOFF::XMC_TD)
- report_fatal_error(
-        "Unexpected storage-mapping class for ReadOnlyWithRel kind");
- printCsectDirective(OS);
- return;
- }
-
- // Initialized TLS data.
- if (getKind().isThreadData()) {
- // We only expect XMC_TL here for initialized TLS data.
- if (getMappingClass() != XCOFF::XMC_TL)
- report_fatal_error("Unhandled storage-mapping class for .tdata csect.");
- printCsectDirective(OS);
- return;
- }
-
- if (getKind().isData()) {
- switch (getMappingClass()) {
- case XCOFF::XMC_RW:
- case XCOFF::XMC_DS:
- case XCOFF::XMC_TD:
- printCsectDirective(OS);
- break;
- case XCOFF::XMC_TC:
- case XCOFF::XMC_TE:
- break;
- case XCOFF::XMC_TC0:
- OS << "\t.toc\n";
- break;
- default:
- report_fatal_error(
- "Unhandled storage-mapping class for .data csect.");
- }
- return;
- }
-
- if (isCsect() && getMappingClass() == XCOFF::XMC_TD) {
- // Common csect type (uninitialized storage) does not have to print csect
- // directive for section switching unless it is local.
- if (getKind().isCommon() && !getKind().isBSSLocal())
- return;
-
- assert(getKind().isBSS() && "Unexpected section kind for toc-data");
- printCsectDirective(OS);
- return;
- }
- // Common csect type (uninitialized storage) does not have to print csect
- // directive for section switching.
- if (isCsect() && getCSectType() == XCOFF::XTY_CM) {
- assert((getMappingClass() == XCOFF::XMC_RW ||
- getMappingClass() == XCOFF::XMC_BS ||
- getMappingClass() == XCOFF::XMC_UL) &&
- "Generated a storage-mapping class for a common/bss/tbss csect we "
- "don't "
- "understand how to switch to.");
- // Common symbols and local zero-initialized symbols for TLS and Non-TLS are
- // eligible for .bss/.tbss csect, getKind().isThreadBSS() is used to cover
- // TLS common and zero-initialized local symbols since linkage type (in the
- // GlobalVariable) is not accessible in this class.
- assert((getKind().isBSSLocal() || getKind().isCommon() ||
- getKind().isThreadBSS()) &&
- "wrong symbol type for .bss/.tbss csect");
- // Don't have to print a directive for switching to section for commons and
- // zero-initialized TLS data. The '.comm' and '.lcomm' directives of the
- // variable will create the needed csect.
- return;
- }
-
- // Zero-initialized TLS data with weak or external linkage are not eligible to
- // be put into common csect.
- if (getKind().isThreadBSS()) {
- printCsectDirective(OS);
- return;
- }
-
- // XCOFF debug sections.
- if (getKind().isMetadata() && isDwarfSect()) {
- OS << "\n\t.dwsect " << format("0x%" PRIx32, *getDwarfSubtypeFlags())
- << '\n';
- OS << getName() << ':' << '\n';
- return;
- }
-
- report_fatal_error("Printing for this SectionKind is unimplemented.");
-}
-
-bool MCSectionXCOFF::useCodeAlign() const { return getKind().isText(); }
diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp
index e14a32f..bc73981 100644
--- a/llvm/lib/MC/MCStreamer.cpp
+++ b/llvm/lib/MC/MCStreamer.cpp
@@ -56,12 +56,11 @@ void MCTargetStreamer::finish() {}
void MCTargetStreamer::emitConstantPools() {}
-void MCTargetStreamer::changeSection(const MCSection *CurSection,
- MCSection *Section, uint32_t Subsection,
- raw_ostream &OS) {
- Section->printSwitchToSection(*Streamer.getContext().getAsmInfo(),
- Streamer.getContext().getTargetTriple(), OS,
- Subsection);
+void MCTargetStreamer::changeSection(const MCSection *, MCSection *Sec,
+ uint32_t Subsection, raw_ostream &OS) {
+ auto &MAI = *Streamer.getContext().getAsmInfo();
+ MAI.printSwitchToSection(*Sec, Subsection,
+ Streamer.getContext().getTargetTriple(), OS);
}
void MCTargetStreamer::emitDwarfFileDirective(StringRef Directive) {
@@ -838,8 +837,8 @@ static MCSection *getWinCFISection(MCContext &Context, unsigned *NextWinCFIID,
if (TextSec == Context.getObjectFileInfo()->getTextSection())
return MainCFISec;
- const auto *TextSecCOFF = cast<MCSectionCOFF>(TextSec);
- auto *MainCFISecCOFF = cast<MCSectionCOFF>(MainCFISec);
+ const auto *TextSecCOFF = static_cast<const MCSectionCOFF *>(TextSec);
+ auto *MainCFISecCOFF = static_cast<MCSectionCOFF *>(MainCFISec);
unsigned UniqueID = TextSecCOFF->getOrAssignWinCFISectionID(NextWinCFIID);
// If this section is COMDAT, this unwind section should be COMDAT associative
@@ -1314,9 +1313,20 @@ void MCStreamer::emitZerofill(MCSection *, MCSymbol *, uint64_t, Align, SMLoc) {
}
void MCStreamer::emitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
uint64_t Size, Align ByteAlignment) {}
-void MCStreamer::changeSection(MCSection *Section, uint32_t) {
- CurFrag = &Section->getDummyFragment();
+
+void MCStreamer::changeSection(MCSection *Sec, uint32_t) {
+ CurFrag = &Sec->getDummyFragment();
+ auto *Sym = Sec->getBeginSymbol();
+ if (!Sym || !Sym->isUndefined())
+ return;
+ // In Mach-O, DWARF sections use Begin as a temporary label, requiring a label
+ // definition, unlike section symbols in other file formats.
+ if (getContext().getObjectFileType() == MCContext::IsMachO)
+ emitLabel(Sym);
+ else
+ Sym->setFragment(CurFrag);
}
+
void MCStreamer::emitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {}
void MCStreamer::emitBytes(StringRef Data) {}
void MCStreamer::emitBinaryData(StringRef Data) { emitBytes(Data); }
@@ -1358,9 +1368,6 @@ void MCStreamer::switchSection(MCSection *Section, uint32_t Subsection) {
changeSection(Section, Subsection);
SectionStack.back().first = MCSectionSubPair(Section, Subsection);
assert(!Section->hasEnded() && "Section already ended");
- MCSymbol *Sym = Section->getBeginSymbol();
- if (Sym && !Sym->isInSection())
- emitLabel(Sym);
}
}
@@ -1387,9 +1394,6 @@ void MCStreamer::switchSectionNoPrint(MCSection *Section) {
SectionStack.back().second = SectionStack.back().first;
SectionStack.back().first = MCSectionSubPair(Section, 0);
changeSection(Section, 0);
- MCSymbol *Sym = Section->getBeginSymbol();
- if (Sym && !Sym->isInSection())
- emitLabel(Sym);
}
MCSymbol *MCStreamer::endSection(MCSection *Section) {
diff --git a/llvm/lib/MC/MCWasmStreamer.cpp b/llvm/lib/MC/MCWasmStreamer.cpp
index 5891420c..e3ef111 100644
--- a/llvm/lib/MC/MCWasmStreamer.cpp
+++ b/llvm/lib/MC/MCWasmStreamer.cpp
@@ -58,7 +58,7 @@ void MCWasmStreamer::emitLabelAtPos(MCSymbol *S, SMLoc Loc, MCFragment &F,
void MCWasmStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
MCAssembler &Asm = getAssembler();
- auto *SectionWasm = cast<MCSectionWasm>(Section);
+ auto *SectionWasm = static_cast<const MCSectionWasm *>(Section);
const MCSymbol *Grp = SectionWasm->getGroup();
if (Grp)
Asm.registerSymbol(*Grp);
diff --git a/llvm/lib/MC/MCWin64EH.cpp b/llvm/lib/MC/MCWin64EH.cpp
index 72a8dd7..a87648a 100644
--- a/llvm/lib/MC/MCWin64EH.cpp
+++ b/llvm/lib/MC/MCWin64EH.cpp
@@ -318,6 +318,9 @@ static void EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
// Emit the epilog instructions.
if (EnableUnwindV2) {
+ // Ensure the fixups and appended content apply to the same fragment.
+ OS->ensureHeadroom(info->EpilogMap.size() * 2);
+
bool IsLast = true;
for (const auto &Epilog : llvm::reverse(info->EpilogMap)) {
if (IsLast) {
diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp
index 9369bea..8be5054 100644
--- a/llvm/lib/MC/MCWinCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp
@@ -157,7 +157,8 @@ void MCWinCOFFStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
// Ensure that the first and the second symbols relative to the section are
// the section symbol and the COMDAT symbol.
getAssembler().registerSymbol(*Section->getBeginSymbol());
- if (auto *Sym = cast<MCSectionCOFF>(Section)->getCOMDATSymbol())
+ if (auto *Sym =
+ static_cast<const MCSectionCOFF *>(Section)->getCOMDATSymbol())
getAssembler().registerSymbol(*Sym);
}
@@ -279,6 +280,7 @@ void MCWinCOFFStreamer::emitCOFFSymbolIndex(MCSymbol const *Symbol) {
void MCWinCOFFStreamer::emitCOFFSectionIndex(const MCSymbol *Symbol) {
visitUsedSymbol(*Symbol);
const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext());
+ ensureHeadroom(2);
addFixup(SRE, FK_SecRel_2);
appendContents(2, 0);
}
@@ -292,6 +294,7 @@ void MCWinCOFFStreamer::emitCOFFSecRel32(const MCSymbol *Symbol,
if (Offset)
MCE = MCBinaryExpr::createAdd(
MCE, MCConstantExpr::create(Offset, getContext()), getContext());
+ ensureHeadroom(4);
addFixup(MCE, FK_SecRel_4);
// Emit 4 bytes (zeros) to the object file.
appendContents(4, 0);
@@ -307,6 +310,7 @@ void MCWinCOFFStreamer::emitCOFFImgRel32(const MCSymbol *Symbol,
if (Offset)
MCE = MCBinaryExpr::createAdd(
MCE, MCConstantExpr::create(Offset, getContext()), getContext());
+ ensureHeadroom(4);
addFixup(MCE, FK_Data_4);
// Emit 4 bytes (zeros) to the object file.
appendContents(4, 0);
@@ -317,6 +321,7 @@ void MCWinCOFFStreamer::emitCOFFSecNumber(MCSymbol const *Symbol) {
// Create Symbol for section number.
const MCExpr *MCE = MCCOFFSectionNumberTargetExpr::create(
*Symbol, this->getWriter(), getContext());
+ ensureHeadroom(4);
addFixup(MCE, FK_Data_4);
// Emit 4 bytes (zeros) to the object file.
appendContents(4, 0);
@@ -327,6 +332,7 @@ void MCWinCOFFStreamer::emitCOFFSecOffset(MCSymbol const *Symbol) {
// Create Symbol for section offset.
const MCExpr *MCE =
MCCOFFSectionOffsetTargetExpr::create(*Symbol, getContext());
+ ensureHeadroom(4);
addFixup(MCE, FK_Data_4);
// Emit 4 bytes (zeros) to the object file.
appendContents(4, 0);
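Each of these emitters now reserves headroom before the addFixup + appendContents pair, so that if a fragment boundary is needed it is introduced before the fixup is recorded, never between the fixup and its placeholder bytes. A toy model of why the ordering matters; fixed-capacity fragments stand in for the real bump-allocated ones:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <string>
    #include <vector>

    struct Fixup { uint32_t Offset; };
    struct Fragment { std::string Contents; std::vector<Fixup> Fixups; };

    struct ToyStreamer {
      static constexpr size_t Capacity = 8; // illustrative fragment capacity
      std::vector<Fragment> Frags{Fragment{}};

      Fragment &cur() { return Frags.back(); }

      // Start a new fragment *now* if N more bytes would not fit.
      void ensureHeadroom(size_t N) {
        if (cur().Contents.size() + N > Capacity)
          Frags.emplace_back();
      }
      void addFixup() {
        cur().Fixups.push_back({uint32_t(cur().Contents.size())});
      }
      void appendContents(size_t N, char V) {
        ensureHeadroom(N); // simplification: an append never splits
        cur().Contents.append(N, V);
      }
    };

    int main() {
      ToyStreamer S;
      S.appendContents(6, 'x'); // partially fill the first fragment
      // The idiom from the patch: reserve, record the fixup, then append.
      S.ensureHeadroom(4);
      S.addFixup();
      S.appendContents(4, 0);
      // Fixup and its 4 bytes landed together in the second fragment.
      assert(S.Frags.size() == 2);
      assert(S.Frags.back().Fixups[0].Offset == 0);
    }

Without the explicit ensureHeadroom, addFixup would record offset 6 in the first fragment while the four placeholder bytes spill into the second, and the writer would patch the wrong bytes.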
diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp
index 63381b4..26f45ce 100644
--- a/llvm/lib/MC/MCXCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCXCOFFStreamer.cpp
@@ -36,6 +36,20 @@ XCOFFObjectWriter &MCXCOFFStreamer::getWriter() {
return static_cast<XCOFFObjectWriter &>(getAssembler().getWriter());
}
+void MCXCOFFStreamer::changeSection(MCSection *Section, uint32_t Subsection) {
+ MCObjectStreamer::changeSection(Section, Subsection);
+ auto *Sec = static_cast<const MCSectionXCOFF *>(Section);
+ // We might miss calculating the symbol difference as an absolute value
+ // before adding fixups when symbol_A, which has no fragment set, is the
+ // csect itself and symbol_B lies within it.
+ // TODO: Currently we only set the fragment for XMC_PR csects and DWARF
+ // sections because we don't have other cases that hit this problem yet.
+ if (Sec->isDwarfSect() || Sec->getMappingClass() == XCOFF::XMC_PR)
+ Sec->getQualNameSymbol()->setFragment(CurFrag);
+}
+
bool MCXCOFFStreamer::emitSymbolAttribute(MCSymbol *Sym,
MCSymbolAttr Attribute) {
auto *Symbol = cast<MCSymbolXCOFF>(Sym);
@@ -89,16 +103,8 @@ void MCXCOFFStreamer::emitXCOFFSymbolLinkageWithVisibility(
void MCXCOFFStreamer::emitXCOFFRefDirective(const MCSymbol *Symbol) {
// Add a Fixup here to later record a relocation of type R_REF to prevent the
// ref symbol from being garbage collected (by the binder).
- MCFragment *DF = getCurrentFragment();
- const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext());
- std::optional<MCFixupKind> MaybeKind =
- getAssembler().getBackend().getFixupKind("R_REF");
- if (!MaybeKind)
- report_fatal_error("failed to get fixup kind for R_REF relocation");
-
- MCFixupKind Kind = *MaybeKind;
- MCFixup Fixup = MCFixup::create(DF->getContents().size(), SRE, Kind);
- DF->addFixup(Fixup);
+ addFixup(MCSymbolRefExpr::create(Symbol, getContext()),
+ XCOFF::RelocationType::R_REF);
}
void MCXCOFFStreamer::emitXCOFFRenameDirective(const MCSymbol *Name,
diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp
index 48d2fc6..e87696a 100644
--- a/llvm/lib/MC/MachObjectWriter.cpp
+++ b/llvm/lib/MC/MachObjectWriter.cpp
@@ -126,7 +126,8 @@ uint64_t MachObjectWriter::getSymbolAddress(const MCSymbol &S) const {
uint64_t MachObjectWriter::getPaddingSize(const MCAssembler &Asm,
const MCSection *Sec) const {
uint64_t EndAddr = getSectionAddress(Sec) + Asm.getSectionAddressSize(*Sec);
- unsigned Next = cast<MCSectionMachO>(Sec)->getLayoutOrder() + 1;
+ unsigned Next =
+ static_cast<const MCSectionMachO *>(Sec)->getLayoutOrder() + 1;
if (Next >= SectionOrder.size())
return 0;
@@ -259,15 +260,12 @@ void MachObjectWriter::writeSegmentLoadCommand(
}
void MachObjectWriter::writeSection(const MCAssembler &Asm,
- const MCSection &Sec, uint64_t VMAddr,
+ const MCSectionMachO &Sec, uint64_t VMAddr,
uint64_t FileOffset, unsigned Flags,
uint64_t RelocationsStart,
unsigned NumRelocations) {
- uint64_t SectionSize = Asm.getSectionAddressSize(Sec);
- const MCSectionMachO &Section = cast<MCSectionMachO>(Sec);
-
// The offset is unused for virtual sections.
- if (Section.isBssSection()) {
+ if (Sec.isBssSection()) {
assert(Asm.getSectionFileSize(Sec) == 0 && "Invalid file size!");
FileOffset = 0;
}
@@ -275,11 +273,11 @@ void MachObjectWriter::writeSection(const MCAssembler &Asm,
// struct section (68 bytes) or
// struct section_64 (80 bytes)
+ uint64_t SectionSize = Asm.getSectionAddressSize(Sec);
uint64_t Start = W.OS.tell();
(void) Start;
-
- writeWithPadding(Section.getName(), 16);
- writeWithPadding(Section.getSegmentName(), 16);
+ writeWithPadding(Sec.getName(), 16);
+ writeWithPadding(Sec.getSegmentName(), 16);
if (is64Bit()) {
W.write<uint64_t>(VMAddr); // address
W.write<uint64_t>(SectionSize); // size
@@ -290,14 +288,14 @@ void MachObjectWriter::writeSection(const MCAssembler &Asm,
assert(isUInt<32>(FileOffset) && "Cannot encode offset of section");
W.write<uint32_t>(FileOffset);
- W.write<uint32_t>(Log2(Section.getAlign()));
+ W.write<uint32_t>(Log2(Sec.getAlign()));
assert((!NumRelocations || isUInt<32>(RelocationsStart)) &&
"Cannot encode offset of relocations");
W.write<uint32_t>(NumRelocations ? RelocationsStart : 0);
W.write<uint32_t>(NumRelocations);
W.write<uint32_t>(Flags);
W.write<uint32_t>(IndirectSymBase.lookup(&Sec)); // reserved1
- W.write<uint32_t>(Section.getStubSize()); // reserved2
+ W.write<uint32_t>(Sec.getStubSize()); // reserved2
if (is64Bit())
W.write<uint32_t>(0); // reserved3
@@ -531,7 +529,7 @@ void MachObjectWriter::bindIndirectSymbols(MCAssembler &Asm) {
// Report errors for use of .indirect_symbol not in a symbol pointer section
// or stub section.
for (IndirectSymbolData &ISD : IndirectSymbols) {
- const MCSectionMachO &Section = cast<MCSectionMachO>(*ISD.Section);
+ const MCSectionMachO &Section = static_cast<MCSectionMachO &>(*ISD.Section);
if (Section.getType() != MachO::S_NON_LAZY_SYMBOL_POINTERS &&
Section.getType() != MachO::S_LAZY_SYMBOL_POINTERS &&
@@ -545,7 +543,7 @@ void MachObjectWriter::bindIndirectSymbols(MCAssembler &Asm) {
// Bind non-lazy symbol pointers first.
for (auto [IndirectIndex, ISD] : enumerate(IndirectSymbols)) {
- const auto &Section = cast<MCSectionMachO>(*ISD.Section);
+ const auto &Section = static_cast<MCSectionMachO &>(*ISD.Section);
if (Section.getType() != MachO::S_NON_LAZY_SYMBOL_POINTERS &&
Section.getType() != MachO::S_THREAD_LOCAL_VARIABLE_POINTERS)
@@ -559,7 +557,7 @@ void MachObjectWriter::bindIndirectSymbols(MCAssembler &Asm) {
// Then lazy symbol pointers and symbol stubs.
for (auto [IndirectIndex, ISD] : enumerate(IndirectSymbols)) {
- const auto &Section = cast<MCSectionMachO>(*ISD.Section);
+ const auto &Section = static_cast<MCSectionMachO &>(*ISD.Section);
if (Section.getType() != MachO::S_LAZY_SYMBOL_POINTERS &&
Section.getType() != MachO::S_SYMBOL_STUBS)
@@ -684,13 +682,13 @@ void MachObjectWriter::computeSectionAddresses(const MCAssembler &Asm) {
for (MCSection &Sec : Asm) {
if (!Sec.isBssSection()) {
SectionOrder.push_back(&Sec);
- cast<MCSectionMachO>(Sec).setLayoutOrder(i++);
+ static_cast<MCSectionMachO &>(Sec).setLayoutOrder(i++);
}
}
for (MCSection &Sec : Asm) {
if (Sec.isBssSection()) {
SectionOrder.push_back(&Sec);
- cast<MCSectionMachO>(Sec).setLayoutOrder(i++);
+ static_cast<MCSectionMachO &>(Sec).setLayoutOrder(i++);
}
}
@@ -808,7 +806,7 @@ uint64_t MachObjectWriter::writeObject() {
}
MCSection *Sec = getContext().getMachOSection("__LLVM", "__cg_profile", 0,
SectionKind::getMetadata());
- llvm::copy(OS.str(), Sec->curFragList()->Head->getContents().data());
+ llvm::copy(OS.str(), Sec->curFragList()->Head->getVarContents().data());
}
unsigned NumSections = Asm.end() - Asm.begin();
@@ -907,7 +905,7 @@ uint64_t MachObjectWriter::writeObject() {
// ... and then the section headers.
uint64_t RelocTableEnd = SectionDataStart + SectionDataFileSize;
for (const MCSection &Section : Asm) {
- const auto &Sec = cast<MCSectionMachO>(Section);
+ const auto &Sec = static_cast<const MCSectionMachO &>(Section);
std::vector<RelAndSymbol> &Relocs = Relocations[&Sec];
unsigned NumRelocs = Relocs.size();
uint64_t SectionStart = SectionDataStart + getSectionAddress(&Sec);
diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp
index 3b99af4..bfd6334 100644
--- a/llvm/lib/MC/WasmObjectWriter.cpp
+++ b/llvm/lib/MC/WasmObjectWriter.cpp
@@ -480,7 +480,7 @@ void WasmObjectWriter::recordRelocation(const MCFragment &F,
// The WebAssembly backend should never generate FKF_IsPCRel fixups
assert(!Fixup.isPCRel());
- const auto &FixupSection = cast<MCSectionWasm>(*F.getParent());
+ const auto &FixupSection = static_cast<MCSectionWasm &>(*F.getParent());
uint64_t C = Target.getConstant();
uint64_t FixupOffset = Asm->getFragmentOffset(F) + Fixup.getOffset();
MCContext &Ctx = getContext();
diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp
index 6ad4334..856850d 100644
--- a/llvm/lib/MC/WinCOFFObjectWriter.cpp
+++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp
@@ -373,7 +373,7 @@ void WinCOFFWriter::defineSymbol(const MCSymbol &MCSym) {
COFFSection *Sec = nullptr;
MCSectionCOFF *MCSec = nullptr;
if (Base && Base->getFragment()) {
- MCSec = cast<MCSectionCOFF>(Base->getFragment()->getParent());
+ MCSec = static_cast<MCSectionCOFF *>(Base->getFragment()->getParent());
Sec = SectionMap[MCSec];
}
@@ -1057,7 +1057,8 @@ uint64_t WinCOFFWriter::writeObject() {
continue;
}
- const auto *AssocMCSec = cast<MCSectionCOFF>(&AssocMCSym->getSection());
+ const auto *AssocMCSec =
+ static_cast<const MCSectionCOFF *>(&AssocMCSym->getSection());
assert(SectionMap.count(AssocMCSec));
COFFSection *AssocSec = SectionMap[AssocMCSec];
diff --git a/llvm/lib/MC/XCOFFObjectWriter.cpp b/llvm/lib/MC/XCOFFObjectWriter.cpp
index 2f6785f..65f543b 100644
--- a/llvm/lib/MC/XCOFFObjectWriter.cpp
+++ b/llvm/lib/MC/XCOFFObjectWriter.cpp
@@ -550,13 +550,13 @@ CsectGroup &XCOFFWriter::getCsectGroup(const MCSectionXCOFF *MCSec) {
static MCSectionXCOFF *getContainingCsect(const MCSymbolXCOFF *XSym) {
if (XSym->isDefined())
- return cast<MCSectionXCOFF>(XSym->getFragment()->getParent());
+ return static_cast<MCSectionXCOFF *>(XSym->getFragment()->getParent());
return XSym->getRepresentedCsect();
}
void XCOFFWriter::executePostLayoutBinding() {
for (const auto &S : *Asm) {
- const auto *MCSec = cast<const MCSectionXCOFF>(&S);
+ auto *MCSec = static_cast<const MCSectionXCOFF *>(&S);
assert(!SectionMap.contains(MCSec) && "Cannot add a section twice.");
// If the name does not fit in the storage provided in the symbol table
@@ -747,7 +747,7 @@ void XCOFFWriter::recordRelocation(const MCFragment &F, const MCFixup &Fixup,
FixedValue = TOCEntryOffset;
}
} else if (Type == XCOFF::RelocationType::R_RBR) {
- MCSectionXCOFF *ParentSec = cast<MCSectionXCOFF>(F.getParent());
+ auto *ParentSec = static_cast<MCSectionXCOFF *>(F.getParent());
assert((SymASec->getMappingClass() == XCOFF::XMC_PR &&
ParentSec->getMappingClass() == XCOFF::XMC_PR) &&
"Only XMC_PR csect may have the R_RBR relocation.");
@@ -768,7 +768,7 @@ void XCOFFWriter::recordRelocation(const MCFragment &F, const MCFixup &Fixup,
}
XCOFFRelocation Reloc = {Index, FixupOffsetInCsect, SignAndSize, Type};
- MCSectionXCOFF *RelocationSec = cast<MCSectionXCOFF>(F.getParent());
+ auto *RelocationSec = static_cast<MCSectionXCOFF *>(F.getParent());
assert(SectionMap.contains(RelocationSec) &&
"Expected containing csect to exist in map.");
SectionMap[RelocationSec]->Relocations.push_back(Reloc);
diff --git a/llvm/lib/Object/IRSymtab.cpp b/llvm/lib/Object/IRSymtab.cpp
index 2579fa3..0f19495 100644
--- a/llvm/lib/Object/IRSymtab.cpp
+++ b/llvm/lib/Object/IRSymtab.cpp
@@ -8,11 +8,11 @@
#include "llvm/Object/IRSymtab.h"
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Comdat.h"
@@ -213,9 +213,10 @@ Expected<int> Builder::getComdatIndex(const Comdat *C, const Module *M) {
return P.first->second;
}
-static DenseSet<StringRef> buildPreservedSymbolsSet(const Triple &TT) {
- DenseSet<StringRef> PreservedSymbolSet(std::begin(PreservedSymbols),
- std::end(PreservedSymbols));
+static StringSet<> buildPreservedSymbolsSet(const Triple &TT) {
+ StringSet<> PreservedSymbolSet;
+ PreservedSymbolSet.insert(std::begin(PreservedSymbols),
+ std::end(PreservedSymbols));
// FIXME: Do we need to pass in ABI fields from TargetOptions?
RTLIB::RuntimeLibcallsInfo Libcalls(TT);
for (RTLIB::LibcallImpl Impl : Libcalls.getLibcallImpls()) {
@@ -280,7 +281,7 @@ Error Builder::addSymbol(const ModuleSymbolTable &Msymtab,
setStr(Sym.IRName, GV->getName());
- static const DenseSet<StringRef> PreservedSymbolsSet =
+ static const StringSet<> PreservedSymbolsSet =
buildPreservedSymbolsSet(GV->getParent()->getTargetTriple());
bool IsPreservedSymbol = PreservedSymbolsSet.contains(GV->getName());
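One practical difference in this swap: StringSet<> copies each key's bytes into its own allocator, whereas DenseSet<StringRef> stores only pointers into whatever backed the StringRefs. A small sketch of the ownership behavior (the symbol name is just an example string):

    #include "llvm/ADT/StringSet.h"
    #include <cassert>
    #include <string>

    using namespace llvm;

    int main() {
      StringSet<> Set;
      {
        std::string Temp = "__stack_chk_guard"; // short-lived source string
        Set.insert(Temp);                       // StringSet copies the bytes
      }
      // Safe: the set owns its copy of the key. A DenseSet<StringRef> would
      // now hold a dangling pointer into the destroyed string.
      assert(Set.contains("__stack_chk_guard"));
    }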
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index bb7ccdb..1b111dc 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -84,6 +84,7 @@ MODULE_PASS("global-merge-func", GlobalMergeFuncPass())
MODULE_PASS("globalopt", GlobalOptPass())
MODULE_PASS("globalsplit", GlobalSplitPass())
MODULE_PASS("hipstdpar-interpose-alloc", HipStdParAllocationInterpositionPass())
+MODULE_PASS("hipstdpar-math-fixup", HipStdParMathFixupPass())
MODULE_PASS("hipstdpar-select-accelerator-code",
HipStdParAcceleratorCodeSelectionPass())
MODULE_PASS("hotcoldsplit", HotColdSplittingPass())
@@ -119,7 +120,6 @@ MODULE_PASS("module-inline", ModuleInlinerPass())
MODULE_PASS("name-anon-globals", NameAnonGlobalPass())
MODULE_PASS("no-op-module", NoOpModulePass())
MODULE_PASS("nsan", NumericalStabilitySanitizerPass())
-MODULE_PASS("objc-arc-apelim", ObjCARCAPElimPass())
MODULE_PASS("openmp-opt", OpenMPOptPass())
MODULE_PASS("openmp-opt-postlink",
OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink))
diff --git a/llvm/lib/Support/Debug.cpp b/llvm/lib/Support/Debug.cpp
index 5bb04d0..b6f338f 100644
--- a/llvm/lib/Support/Debug.cpp
+++ b/llvm/lib/Support/Debug.cpp
@@ -24,11 +24,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/Debug.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/circular_raw_ostream.h"
#include "llvm/Support/raw_ostream.h"
+#include <utility>
#include "DebugOptions.h"
@@ -38,27 +40,62 @@
using namespace llvm;
+/// Parse a debug type string into a pair of the debug type and the debug level.
+/// The expected format is "type[:level]", where the level is an optional
+/// integer.
+static std::pair<std::string, std::optional<int>>
+parseDebugType(StringRef DbgType) {
+ std::optional<int> Level;
+ size_t ColonPos = DbgType.find(':');
+ if (ColonPos != StringRef::npos) {
+ StringRef LevelStr = DbgType.substr(ColonPos + 1);
+ DbgType = DbgType.take_front(ColonPos);
+ if (LevelStr.empty())
+ Level = 0;
+ else {
+      int ParsedLevel;
+      if (to_integer(LevelStr, ParsedLevel, 10))
+        Level = ParsedLevel;
+ }
+ }
+ return std::make_pair(DbgType.str(), Level);
+}
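The accepted forms are therefore "type" (no level: always enabled), "type:" (level 0: an explicit opt-out), and "type:N". A standalone re-implementation for illustration, with std::from_chars standing in for llvm::to_integer:

    #include <cassert>
    #include <charconv>
    #include <optional>
    #include <string>
    #include <string_view>
    #include <utility>

    static std::pair<std::string, std::optional<int>>
    parseDebugType(std::string_view DbgType) {
      std::optional<int> Level;
      size_t ColonPos = DbgType.find(':');
      if (ColonPos != std::string_view::npos) {
        std::string_view LevelStr = DbgType.substr(ColonPos + 1);
        DbgType = DbgType.substr(0, ColonPos);
        if (LevelStr.empty()) {
          Level = 0; // "type:" opts this type out entirely
        } else {
          int Parsed = 0;
          auto [Ptr, Ec] = std::from_chars(
              LevelStr.data(), LevelStr.data() + LevelStr.size(), Parsed);
          if (Ec == std::errc() && Ptr == LevelStr.data() + LevelStr.size())
            Level = Parsed;
        }
      }
      return {std::string(DbgType), Level};
    }

    int main() {
      assert(!parseDebugType("isel").second.has_value());
      assert(parseDebugType("isel:").second == 0);
      assert(parseDebugType("isel:2").second == 2);
    }

Note that, like the original, a malformed level such as "isel:x" leaves the level unset rather than reporting an error.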
+
// Even though LLVM might be built with NDEBUG, define symbols that the code
// built without NDEBUG can depend on via the llvm/Support/Debug.h header.
namespace llvm {
/// Exported boolean set by the -debug option.
bool DebugFlag = false;
-static ManagedStatic<std::vector<std::string>> CurrentDebugType;
+/// The currently enabled debug types, each with an optional debug level.
+/// The debug level is the verbosity of the debug output.
+/// 0 is a special level that acts as an opt-out for that specific debug type.
+/// If a level is provided, debug output is enabled only when the user
+/// specified a level at least as high as the requested one.
+static ManagedStatic<std::vector<std::pair<std::string, std::optional<int>>>>
+ CurrentDebugType;
/// Return true if the specified string is the debug type
/// specified on the command line, or if none was specified on the command line
/// with the -debug-only=X option.
-bool isCurrentDebugType(const char *DebugType) {
+bool isCurrentDebugType(const char *DebugType, int Level) {
if (CurrentDebugType->empty())
return true;
+ // Track whether at least one listed debug type is enabled; this is what
+ // lets a "type:0" entry opt out of one type while leaving all others on.
+ bool HasEnabledDebugType = false;
// See if DebugType is in list. Note: do not use find() as that forces us to
// unnecessarily create an std::string instance.
- for (auto &d : *CurrentDebugType) {
- if (d == DebugType)
+ for (auto &D : *CurrentDebugType) {
+ HasEnabledDebugType =
+ HasEnabledDebugType || (!D.second.has_value() || D.second.value() > 0);
+ if (D.first != DebugType)
+ continue;
+ if (!D.second.has_value())
return true;
+ return D.second >= Level;
}
- return false;
+ return !HasEnabledDebugType;
}
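Combined with parseDebugType, the matching rules work out as: an entry without a level is unconditionally enabled, "type:0" disables just that type, and when every listed entry is an opt-out the final return enables everything not listed. A compact model of the logic exercising those cases (the pass names are hypothetical):

    #include <cassert>
    #include <optional>
    #include <string>
    #include <utility>
    #include <vector>

    using Entry = std::pair<std::string, std::optional<int>>;

    // Mirror of isCurrentDebugType over a local list of parsed entries.
    static bool isEnabled(const std::vector<Entry> &Types,
                          const std::string &DebugType, int Level = 1) {
      if (Types.empty())
        return true;
      bool HasEnabledDebugType = false;
      for (const Entry &D : Types) {
        HasEnabledDebugType |= !D.second.has_value() || *D.second > 0;
        if (D.first != DebugType)
          continue;
        return !D.second.has_value() || *D.second >= Level;
      }
      return !HasEnabledDebugType; // all entries opt-outs: default-enable
    }

    int main() {
      // -debug-only=isel:0 -- opt isel out, everything else stays enabled.
      std::vector<Entry> OptOut = {{"isel", 0}};
      assert(!isEnabled(OptOut, "isel"));
      assert(isEnabled(OptOut, "regalloc"));
      // -debug-only=isel:2 -- isel enabled up to level 2, others disabled.
      std::vector<Entry> Leveled = {{"isel", 2}};
      assert(isEnabled(Leveled, "isel", 2));
      assert(!isEnabled(Leveled, "isel", 3));
      assert(!isEnabled(Leveled, "regalloc"));
    }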
/// Set the current debug type, as if the -debug-only=X
@@ -73,8 +110,11 @@ void setCurrentDebugType(const char *Type) {
void setCurrentDebugTypes(const char **Types, unsigned Count) {
CurrentDebugType->clear();
- llvm::append_range(*CurrentDebugType, ArrayRef(Types, Count));
+ CurrentDebugType->reserve(Count);
+ for (const char *Type : ArrayRef(Types, Count))
+ CurrentDebugType->push_back(parseDebugType(Type));
}
+
} // namespace llvm
// All Debug.h functionality is a no-op in NDEBUG mode.
@@ -114,10 +154,10 @@ struct DebugOnlyOpt {
if (Val.empty())
return;
DebugFlag = true;
- SmallVector<StringRef,8> dbgTypes;
- StringRef(Val).split(dbgTypes, ',', -1, false);
- for (auto dbgType : dbgTypes)
- CurrentDebugType->push_back(std::string(dbgType));
+ SmallVector<StringRef, 8> DbgTypes;
+ StringRef(Val).split(DbgTypes, ',', -1, false);
+ for (auto DbgType : DbgTypes)
+ CurrentDebugType->push_back(parseDebugType(DbgType));
}
};
} // namespace
@@ -129,8 +169,13 @@ struct CreateDebugOnly {
static void *call() {
return new cl::opt<DebugOnlyOpt, true, cl::parser<std::string>>(
"debug-only",
- cl::desc("Enable a specific type of debug output (comma separated list "
- "of types)"),
+ cl::desc(
+ "Enable a specific type of debug output (comma separated list "
+ "of types using the format \"type[:level]\", where the level "
+ "is an optional integer. The level can be set to 1, 2, 3, etc. to "
+ "control the verbosity of the output. Setting a debug-type level "
+ "to zero acts as an opt-out for this specific debug-type without "
+ "affecting the others."),
cl::Hidden, cl::value_desc("debug string"),
cl::location(DebugOnlyOptLoc), cl::ValueRequired);
}
diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp
index 1f3e5dc..3f318e2 100644
--- a/llvm/lib/TableGen/Record.cpp
+++ b/llvm/lib/TableGen/Record.cpp
@@ -985,6 +985,12 @@ const Init *UnOpInit::Fold(const Record *CurRec, bool IsFinal) const {
}
break;
+ case GETDAGOPNAME:
+ if (const auto *Dag = dyn_cast<DagInit>(LHS)) {
+ return Dag->getName();
+ }
+ break;
+
case LOG2:
if (const auto *LHSi = dyn_cast_or_null<IntInit>(
LHS->convertInitializerTo(IntRecTy::get(RK)))) {
@@ -1050,6 +1056,9 @@ std::string UnOpInit::getAsString() const {
case SIZE: Result = "!size"; break;
case EMPTY: Result = "!empty"; break;
case GETDAGOP: Result = "!getdagop"; break;
+ case GETDAGOPNAME:
+ Result = "!getdagopname";
+ break;
case LOG2 : Result = "!logtwo"; break;
case LISTFLATTEN:
Result = "!listflatten";
@@ -1310,7 +1319,11 @@ const Init *BinOpInit::Fold(const Record *CurRec) const {
SmallVector<std::pair<const Init *, const StringInit *>, 8> Args;
llvm::append_range(Args, LHSs->getArgAndNames());
llvm::append_range(Args, RHSs->getArgAndNames());
- return DagInit::get(Op, Args);
+ // Use the name of the LHS DAG if it's set, otherwise the name of the RHS.
+ const auto *NameInit = LHSs->getName();
+ if (!NameInit)
+ NameInit = RHSs->getName();
+ return DagInit::get(Op, NameInit, Args);
}
break;
}
@@ -1508,6 +1521,14 @@ const Init *BinOpInit::Fold(const Record *CurRec) const {
return DagInit::get(Op, Dag->getArgs(), Dag->getArgNames());
break;
}
+ case SETDAGOPNAME: {
+ const auto *Dag = dyn_cast<DagInit>(LHS);
+ const auto *Op = dyn_cast<StringInit>(RHS);
+ if (Dag && Op)
+ return DagInit::get(Dag->getOperator(), Op, Dag->getArgs(),
+ Dag->getArgNames());
+ break;
+ }
case ADD:
case SUB:
case MUL:
@@ -1620,6 +1641,9 @@ std::string BinOpInit::getAsString() const {
case STRCONCAT: Result = "!strconcat"; break;
case INTERLEAVE: Result = "!interleave"; break;
case SETDAGOP: Result = "!setdagop"; break;
+ case SETDAGOPNAME:
+ Result = "!setdagopname";
+ break;
case GETDAGARG:
Result = "!getdagarg<" + getType()->getAsString() + ">";
break;
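Together with the lexer and parser changes below, this adds !getdagopname and !setdagopname, and makes !con propagate the operator name (the LHS name wins, otherwise the RHS name, per the Fold change above). A toy C++ model of just those semantics; the Dag struct here is illustrative, not the real DagInit API:

    #include <cassert>
    #include <optional>
    #include <string>
    #include <utility>
    #include <vector>

    struct Dag {
      std::string Op;                  // operator, e.g. "ins"
      std::optional<std::string> Name; // optional name bound to the operator
      std::vector<std::string> Args;
    };

    // !getdagopname: read the operator name, if any.
    static std::optional<std::string> getDagOpName(const Dag &D) {
      return D.Name;
    }

    // !setdagopname: same dag, new operator name.
    static Dag setDagOpName(Dag D, std::string N) {
      D.Name = std::move(N);
      return D;
    }

    // !con: concatenate args; keep the LHS name if set, else take the RHS's.
    static Dag concat(const Dag &L, const Dag &R) {
      Dag Out{L.Op, L.Name ? L.Name : R.Name, L.Args};
      Out.Args.insert(Out.Args.end(), R.Args.begin(), R.Args.end());
      return Out;
    }

    int main() {
      Dag A = setDagOpName({"ins", std::nullopt, {"a"}}, "lhs");
      Dag B{"ins", std::optional<std::string>("rhs"), {"b"}};
      assert(getDagOpName(concat(A, B)) == std::optional<std::string>("lhs"));
      assert(getDagOpName(concat({"ins", std::nullopt, {}}, B)) ==
             std::optional<std::string>("rhs"));
    }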
diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp
index aea1bb0..c369916 100644
--- a/llvm/lib/TableGen/TGLexer.cpp
+++ b/llvm/lib/TableGen/TGLexer.cpp
@@ -680,6 +680,8 @@ tgtok::TokKind TGLexer::LexExclaim() {
.Case("find", tgtok::XFind)
.Cases("setdagop", "setop", tgtok::XSetDagOp) // !setop is deprecated.
.Cases("getdagop", "getop", tgtok::XGetDagOp) // !getop is deprecated.
+ .Case("setdagopname", tgtok::XSetDagOpName)
+ .Case("getdagopname", tgtok::XGetDagOpName)
.Case("getdagarg", tgtok::XGetDagArg)
.Case("getdagname", tgtok::XGetDagName)
.Case("setdagarg", tgtok::XSetDagArg)
diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h
index ed7d8f3..5725e39 100644
--- a/llvm/lib/TableGen/TGLexer.h
+++ b/llvm/lib/TableGen/TGLexer.h
@@ -150,6 +150,8 @@ enum TokKind {
XGt,
XSetDagOp,
XGetDagOp,
+ XSetDagOpName,
+ XGetDagOpName,
XExists,
XListRemove,
XToLower,
diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp
index 62c5355..81b61b1 100644
--- a/llvm/lib/TableGen/TGParser.cpp
+++ b/llvm/lib/TableGen/TGParser.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "TGParser.h"
+#include "TGLexer.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
@@ -1199,6 +1200,7 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
case tgtok::XCast:
case tgtok::XRepr:
case tgtok::XGetDagOp:
+ case tgtok::XGetDagOpName:
case tgtok::XInitialized: { // Value ::= !unop '(' Value ')'
UnOpInit::UnaryOp Code;
const RecTy *Type = nullptr;
@@ -1287,6 +1289,11 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
}
Code = UnOpInit::GETDAGOP;
break;
+ case tgtok::XGetDagOpName:
+ Lex.Lex(); // eat the operation
+ Type = StringRecTy::get(Records);
+ Code = UnOpInit::GETDAGOPNAME;
+ break;
case tgtok::XInitialized:
Lex.Lex(); // eat the operation
Code = UnOpInit::INITIALIZED;
@@ -1514,7 +1521,8 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
case tgtok::XInterleave:
case tgtok::XGetDagArg:
case tgtok::XGetDagName:
- case tgtok::XSetDagOp: { // Value ::= !binop '(' Value ',' Value ')'
+ case tgtok::XSetDagOp:
+ case tgtok::XSetDagOpName: { // Value ::= !binop '(' Value ',' Value ')'
tgtok::TokKind OpTok = Lex.getCode();
SMLoc OpLoc = Lex.getLoc();
Lex.Lex(); // eat the operation
@@ -1550,6 +1558,9 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
case tgtok::XStrConcat: Code = BinOpInit::STRCONCAT; break;
case tgtok::XInterleave: Code = BinOpInit::INTERLEAVE; break;
case tgtok::XSetDagOp: Code = BinOpInit::SETDAGOP; break;
+ case tgtok::XSetDagOpName:
+ Code = BinOpInit::SETDAGOPNAME;
+ break;
case tgtok::XGetDagArg:
Code = BinOpInit::GETDAGARG;
break;
@@ -1580,6 +1591,10 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
}
ArgType = DagRecTy::get(Records);
break;
+ case tgtok::XSetDagOpName:
+ Type = DagRecTy::get(Records);
+ ArgType = DagRecTy::get(Records);
+ break;
case tgtok::XGetDagName:
Type = StringRecTy::get(Records);
ArgType = DagRecTy::get(Records);
@@ -1773,22 +1788,26 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
// Deal with BinOps whose arguments have different types, by
// rewriting ArgType in between them.
switch (Code) {
- case BinOpInit::SETDAGOP:
- // After parsing the first dag argument, switch to expecting
- // a record, with no restriction on its superclasses.
- ArgType = RecordRecTy::get(Records, {});
- break;
- case BinOpInit::GETDAGARG:
- // After parsing the first dag argument, expect an index integer or a
- // name string.
- ArgType = nullptr;
- break;
- case BinOpInit::GETDAGNAME:
- // After parsing the first dag argument, expect an index integer.
- ArgType = IntRecTy::get(Records);
- break;
- default:
- break;
+ case BinOpInit::SETDAGOPNAME:
+ // After parsing the first dag argument, expect a string.
+ ArgType = StringRecTy::get(Records);
+ break;
+ case BinOpInit::SETDAGOP:
+ // After parsing the first dag argument, switch to expecting
+ // a record, with no restriction on its superclasses.
+ ArgType = RecordRecTy::get(Records, {});
+ break;
+ case BinOpInit::GETDAGARG:
+ // After parsing the first dag argument, expect an index integer or a
+ // name string.
+ ArgType = nullptr;
+ break;
+ case BinOpInit::GETDAGNAME:
+ // After parsing the first dag argument, expect an index integer.
+ ArgType = IntRecTy::get(Records);
+ break;
+ default:
+ break;
}
if (!consume(tgtok::comma))
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index c4b43e1..c52487a 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -176,6 +176,9 @@ public:
std::optional<AArch64PACKey::ID> PACKey,
uint64_t PACDisc, Register PACAddrDisc);
+ // Emit the sequence for PAC.
+ void emitPtrauthSign(const MachineInstr *MI);
+
// Emit the sequence to compute the discriminator.
//
// The returned register is either unmodified AddrDisc or ScratchReg.
@@ -2175,6 +2178,37 @@ void AArch64AsmPrinter::emitPtrauthAuthResign(
OutStreamer->emitLabel(EndSym);
}
+void AArch64AsmPrinter::emitPtrauthSign(const MachineInstr *MI) {
+ Register Val = MI->getOperand(1).getReg();
+ auto Key = (AArch64PACKey::ID)MI->getOperand(2).getImm();
+ uint64_t Disc = MI->getOperand(3).getImm();
+ Register AddrDisc = MI->getOperand(4).getReg();
+ bool AddrDiscKilled = MI->getOperand(4).isKill();
+
+ // As long as at least one of Val and AddrDisc is in GPR64noip, a scratch
+ // register is available.
+ Register ScratchReg = Val == AArch64::X16 ? AArch64::X17 : AArch64::X16;
+ assert(ScratchReg != AddrDisc &&
+ "Neither X16 nor X17 is available as a scratch register");
+
+ // Compute the PAC discriminator.
+ assert(isUInt<16>(Disc));
+ Register DiscReg = emitPtrauthDiscriminator(
+ Disc, AddrDisc, ScratchReg, /*MayUseAddrAsScratch=*/AddrDiscKilled);
+ bool IsZeroDisc = DiscReg == AArch64::XZR;
+ unsigned Opc = getPACOpcodeForKey(Key, IsZeroDisc);
+
+ // paciza x16 ; if IsZeroDisc
+ // pacia x16, x17 ; if !IsZeroDisc
+ MCInst PACInst;
+ PACInst.setOpcode(Opc);
+ PACInst.addOperand(MCOperand::createReg(Val));
+ PACInst.addOperand(MCOperand::createReg(Val));
+ if (!IsZeroDisc)
+ PACInst.addOperand(MCOperand::createReg(DiscReg));
+ EmitToStreamer(*OutStreamer, PACInst);
+}
+
void AArch64AsmPrinter::emitPtrauthBranch(const MachineInstr *MI) {
bool IsCall = MI->getOpcode() == AArch64::BLRA;
unsigned BrTarget = MI->getOperand(0).getReg();
@@ -2890,6 +2924,10 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
MI->getOperand(4).getImm(), MI->getOperand(5).getReg());
return;
+ case AArch64::PAC:
+ emitPtrauthSign(MI);
+ return;
+
case AArch64::LOADauthptrstatic:
LowerLOADauthptrstatic(*MI);
return;
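The key-to-opcode selection used by emitPtrauthSign above comes from getPACOpcodeForKey; a short sketch, assuming the in-tree Utils/AArch64BaseInfo.h helper (opcode results shown in comments for the IA key only):

    #include "Utils/AArch64BaseInfo.h"
    using namespace llvm;

    // A zero discriminator selects the PAC*Z form, a register discriminator
    // selects the two-operand form:
    unsigned PacZeroOpc = getPACOpcodeForKey(AArch64PACKey::IA, /*Zero=*/true);
    //   -> AArch64::PACIZA, emitted as "paciza x16"
    unsigned PacRegOpc = getPACOpcodeForKey(AArch64PACKey::IA, /*Zero=*/false);
    //   -> AArch64::PACIA, emitted as "pacia x16, x17"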
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ef3e8c8..7c9fc67 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3101,6 +3101,83 @@ AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI,
return BB;
}
+// Helper function to find the instruction that defines a virtual register.
+// Returns nullptr if no such instruction can be found.
+static const MachineInstr *stripVRegCopies(const MachineRegisterInfo &MRI,
+ Register Reg) {
+ while (Reg.isVirtual()) {
+ MachineInstr *DefMI = MRI.getVRegDef(Reg);
+ assert(DefMI && "Virtual register definition not found");
+ unsigned Opcode = DefMI->getOpcode();
+
+ if (Opcode == AArch64::COPY) {
+ Reg = DefMI->getOperand(1).getReg();
+ // Vreg is defined by copying from physreg.
+ if (Reg.isPhysical())
+ return DefMI;
+ continue;
+ }
+ if (Opcode == AArch64::SUBREG_TO_REG) {
+ Reg = DefMI->getOperand(2).getReg();
+ continue;
+ }
+
+ return DefMI;
+ }
+ return nullptr;
+}
+
+void AArch64TargetLowering::fixupPtrauthDiscriminator(
+ MachineInstr &MI, MachineBasicBlock *BB, MachineOperand &IntDiscOp,
+ MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const {
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ Register AddrDisc = AddrDiscOp.getReg();
+ int64_t IntDisc = IntDiscOp.getImm();
+ assert(IntDisc == 0 && "Blend components are already expanded");
+
+ const MachineInstr *DiscMI = stripVRegCopies(MRI, AddrDisc);
+ if (DiscMI) {
+ switch (DiscMI->getOpcode()) {
+ case AArch64::MOVKXi:
+ // blend(addr, imm) which is lowered as "MOVK addr, #imm, #48".
+ // #imm should be an immediate and not a global symbol, for example.
+ if (DiscMI->getOperand(2).isImm() &&
+ DiscMI->getOperand(3).getImm() == 48) {
+ AddrDisc = DiscMI->getOperand(1).getReg();
+ IntDisc = DiscMI->getOperand(2).getImm();
+ }
+ break;
+ case AArch64::MOVi32imm:
+ case AArch64::MOVi64imm:
+ // Small immediate integer constant passed via VReg.
+ if (DiscMI->getOperand(1).isImm() &&
+ isUInt<16>(DiscMI->getOperand(1).getImm())) {
+ AddrDisc = AArch64::NoRegister;
+ IntDisc = DiscMI->getOperand(1).getImm();
+ }
+ break;
+ }
+ }
+
+ // For uniformity, always use NoRegister, as XZR is not necessarily contained
+ // in the requested register class.
+ if (AddrDisc == AArch64::XZR)
+ AddrDisc = AArch64::NoRegister;
+
+ // Make sure AddrDisc operand respects the register class imposed by MI.
+ if (AddrDisc && MRI.getRegClass(AddrDisc) != AddrDiscRC) {
+ Register TmpReg = MRI.createVirtualRegister(AddrDiscRC);
+ BuildMI(*BB, MI, DL, TII->get(AArch64::COPY), TmpReg).addReg(AddrDisc);
+ AddrDisc = TmpReg;
+ }
+
+ AddrDiscOp.setReg(AddrDisc);
+ IntDiscOp.setImm(IntDisc);
+}
+
MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *BB) const {
@@ -3199,6 +3276,11 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
case AArch64::MOVT_TIZ_PSEUDO:
return EmitZTInstr(MI, BB, AArch64::MOVT_TIZ, /*Op0IsDef=*/true);
+
+ case AArch64::PAC:
+ fixupPtrauthDiscriminator(MI, BB, MI.getOperand(3), MI.getOperand(4),
+ &AArch64::GPR64noipRegClass);
+ return BB;
}
}
@@ -6814,7 +6896,8 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
SDValue Result = DAG.getMemIntrinsicNode(
AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
- {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
+ {StoreNode->getChain(), DAG.getBitcast(MVT::v2i64, Lo),
+ DAG.getBitcast(MVT::v2i64, Hi), StoreNode->getBasePtr()},
StoreNode->getMemoryVT(), StoreNode->getMemOperand());
return Result;
}
@@ -11242,7 +11325,7 @@ static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal,
SDValue AArch64TargetLowering::LowerSELECT_CC(
ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal,
- iterator_range<SDNode::user_iterator> Users, bool HasNoNaNs,
+ iterator_range<SDNode::user_iterator> Users, SDNodeFlags Flags,
const SDLoc &DL, SelectionDAG &DAG) const {
// Handle f128 first, because it will result in a comparison of some RTLIB
// call result against zero.
@@ -11440,7 +11523,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
return true;
}
})) {
- bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || HasNoNaNs;
+ bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Flags.hasNoNaNs();
SDValue VectorCmp =
emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG);
if (VectorCmp)
@@ -11454,7 +11537,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
AArch64CC::CondCode CC1, CC2;
changeFPCCToAArch64CC(CC, CC1, CC2);
- if (DAG.getTarget().Options.UnsafeFPMath) {
+ if (Flags.hasNoSignedZeros()) {
// Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
// "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
@@ -11533,10 +11616,9 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
SDValue RHS = Op.getOperand(1);
SDValue TVal = Op.getOperand(2);
SDValue FVal = Op.getOperand(3);
- bool HasNoNans = Op->getFlags().hasNoNaNs();
+ SDNodeFlags Flags = Op->getFlags();
SDLoc DL(Op);
- return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL,
- DAG);
+ return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), Flags, DL, DAG);
}
SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
@@ -11544,7 +11626,6 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
SDValue CCVal = Op->getOperand(0);
SDValue TVal = Op->getOperand(1);
SDValue FVal = Op->getOperand(2);
- bool HasNoNans = Op->getFlags().hasNoNaNs();
SDLoc DL(Op);
EVT Ty = Op.getValueType();
@@ -11611,8 +11692,8 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
DAG.getUNDEF(MVT::f32), FVal);
}
- SDValue Res =
- LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL, DAG);
+ SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(),
+ Op->getFlags(), DL, DAG);
if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
@@ -12209,7 +12290,9 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
SDLoc DL(Operand);
EVT VT = Operand.getValueType();
- SDNodeFlags Flags = SDNodeFlags::AllowReassociation;
+ // Ensure nodes can be recognized by isAssociativeAndCommutative.
+ SDNodeFlags Flags =
+ SDNodeFlags::AllowReassociation | SDNodeFlags::NoSignedZeros;
// Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
// AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
@@ -16591,7 +16674,7 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
(Options.AllowFPOpFusion == FPOpFusion::Fast ||
- Options.UnsafeFPMath));
+ I->getFastMathFlags().allowContract()));
}
// All 32-bit GPR operations implicitly zero the high-half of the corresponding
@@ -27911,16 +27994,16 @@ void AArch64TargetLowering::ReplaceNodeResults(
MemVT.getScalarSizeInBits() == 32u ||
MemVT.getScalarSizeInBits() == 64u)) {
+ EVT HalfVT = MemVT.getHalfNumVectorElementsVT(*DAG.getContext());
SDValue Result = DAG.getMemIntrinsicNode(
AArch64ISD::LDNP, SDLoc(N),
- DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
- MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
- MVT::Other}),
+ DAG.getVTList({MVT::v2i64, MVT::v2i64, MVT::Other}),
{LoadNode->getChain(), LoadNode->getBasePtr()},
LoadNode->getMemoryVT(), LoadNode->getMemOperand());
SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
- Result.getValue(0), Result.getValue(1));
+ DAG.getBitcast(HalfVT, Result.getValue(0)),
+ DAG.getBitcast(HalfVT, Result.getValue(1)));
Results.append({Pair, Result.getValue(2) /* Chain */});
return;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index d8403c2..ea63edd8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -182,6 +182,13 @@ public:
MachineBasicBlock *EmitGetSMESaveSize(MachineInstr &MI,
MachineBasicBlock *BB) const;
+ /// Replace (0, vreg) discriminator components with the operands of blend
+ /// or with (immediate, NoRegister) when possible.
+ void fixupPtrauthDiscriminator(MachineInstr &MI, MachineBasicBlock *BB,
+ MachineOperand &IntDiscOp,
+ MachineOperand &AddrDiscOp,
+ const TargetRegisterClass *AddrDiscRC) const;
+
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;
@@ -655,7 +662,7 @@ private:
SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS,
SDValue TVal, SDValue FVal,
iterator_range<SDNode::user_iterator> Users,
- bool HasNoNans, const SDLoc &dl,
+ SDNodeFlags Flags, const SDLoc &dl,
SelectionDAG &DAG) const;
SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 802e4a9..59d4fd2 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -20,7 +20,6 @@
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CFIInstBuilder.h"
#include "llvm/CodeGen/LivePhysRegs.h"
@@ -36,7 +35,6 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/StackMaps.h"
-#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
@@ -6576,10 +6574,8 @@ static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
// We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
// the target options or if FADD/FSUB has the contract fast-math flag.
- return Options.UnsafeFPMath ||
- Options.AllowFPOpFusion == FPOpFusion::Fast ||
+ return Options.AllowFPOpFusion == FPOpFusion::Fast ||
Inst.getFlag(MachineInstr::FmContract);
- return true;
}
return false;
}
@@ -6682,9 +6678,8 @@ bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
case AArch64::FMUL_ZZZ_H:
case AArch64::FMUL_ZZZ_S:
case AArch64::FMUL_ZZZ_D:
- return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath ||
- (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
- Inst.getFlag(MachineInstr::MIFlag::FmNsz));
+ return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
+ Inst.getFlag(MachineInstr::MIFlag::FmNsz);
// == Integer types ==
// -- Base instructions --
@@ -7354,9 +7349,6 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
- case AArch64MachineCombinerPattern::GATHER_LANE_i32:
- case AArch64MachineCombinerPattern::GATHER_LANE_i16:
- case AArch64MachineCombinerPattern::GATHER_LANE_i8:
return true;
} // end switch (Pattern)
return false;
@@ -7397,252 +7389,11 @@ static bool getMiscPatterns(MachineInstr &Root,
return false;
}
-static bool getGatherPattern(MachineInstr &Root,
- SmallVectorImpl<unsigned> &Patterns,
- unsigned LoadLaneOpCode, unsigned NumLanes) {
- const MachineFunction *MF = Root.getMF();
-
- // Early exit if optimizing for size.
- if (MF->getFunction().hasMinSize())
- return false;
-
- const MachineRegisterInfo &MRI = MF->getRegInfo();
- const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
-
- // The root of the pattern must load into the last lane of the vector.
- if (Root.getOperand(2).getImm() != NumLanes - 1)
- return false;
-
- // Check that we have load into all lanes except lane 0.
- // For each load we also want to check that:
- // 1. It has a single non-debug use (since we will be replacing the virtual
- // register)
- // 2. That the addressing mode only uses a single offset register.
- auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
- auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
- SmallSet<unsigned, 4> RemainingLanes(Range.begin(), Range.end());
- while (!RemainingLanes.empty() && CurrInstr &&
- CurrInstr->getOpcode() == LoadLaneOpCode &&
- MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
- CurrInstr->getNumOperands() == 4) {
- RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
- CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
- }
-
- if (!RemainingLanes.empty())
- return false;
-
- // Match the SUBREG_TO_REG sequence.
- if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
- return false;
-
- // Verify that the subreg to reg loads an integer into the first lane.
- auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
- unsigned SingleLaneSizeInBits = 128 / NumLanes;
- if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
- return false;
-
- // Verify that it also has a single non debug use.
- if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
- return false;
-
- switch (NumLanes) {
- case 4:
- Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32);
- break;
- case 8:
- Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16);
- break;
- case 16:
- Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8);
- break;
- default:
- llvm_unreachable("Got bad number of lanes for gather pattern.");
- }
-
- return true;
-}
-
-/// Search for patterns where we use LD1 instructions to load into
-/// separate lanes of an 128 bit Neon register. We can increase Memory Level
-/// Parallelism by loading into 2 Neon registers instead.
-static bool getLoadPatterns(MachineInstr &Root,
- SmallVectorImpl<unsigned> &Patterns) {
-
- // The pattern searches for loads into single lanes.
- switch (Root.getOpcode()) {
- case AArch64::LD1i32:
- return getGatherPattern(Root, Patterns, Root.getOpcode(), 4);
- case AArch64::LD1i16:
- return getGatherPattern(Root, Patterns, Root.getOpcode(), 8);
- case AArch64::LD1i8:
- return getGatherPattern(Root, Patterns, Root.getOpcode(), 16);
- default:
- return false;
- }
-}
-
-static void
-generateGatherPattern(MachineInstr &Root,
- SmallVectorImpl<MachineInstr *> &InsInstrs,
- SmallVectorImpl<MachineInstr *> &DelInstrs,
- DenseMap<Register, unsigned> &InstrIdxForVirtReg,
- unsigned Pattern, unsigned NumLanes) {
-
- MachineFunction &MF = *Root.getParent()->getParent();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
-
- // Gather the initial load instructions to build the pattern
- SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
- MachineInstr *CurrInstr = &Root;
- for (unsigned i = 0; i < NumLanes - 1; ++i) {
- LoadToLaneInstrs.push_back(CurrInstr);
- CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
- }
-
- // Sort the load instructions according to the lane.
- llvm::sort(LoadToLaneInstrs,
- [](const MachineInstr *A, const MachineInstr *B) {
- return A->getOperand(2).getImm() > B->getOperand(2).getImm();
- });
-
- MachineInstr *SubregToReg = CurrInstr;
- LoadToLaneInstrs.push_back(
- MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg()));
- auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);
-
- const TargetRegisterClass *FPR128RegClass =
- MRI.getRegClass(Root.getOperand(0).getReg());
-
- auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr,
- Register SrcRegister, unsigned Lane,
- Register OffsetRegister) {
- auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
- MachineInstrBuilder LoadIndexIntoRegister =
- BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
- NewRegister)
- .addReg(SrcRegister)
- .addImm(Lane)
- .addReg(OffsetRegister, getKillRegState(true));
- InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
- InsInstrs.push_back(LoadIndexIntoRegister);
- return NewRegister;
- };
-
- // Helper to create load instruction based on opcode
- auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg,
- Register OffsetReg) -> MachineInstrBuilder {
- unsigned Opcode;
- switch (NumLanes) {
- case 4:
- Opcode = AArch64::LDRSui;
- break;
- case 8:
- Opcode = AArch64::LDRHui;
- break;
- case 16:
- Opcode = AArch64::LDRBui;
- break;
- default:
- llvm_unreachable(
- "Got unsupported number of lanes in machine-combiner gather pattern");
- }
- // Immediate offset load
- return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
- .addReg(OffsetReg)
- .addImm(0); // immediate offset
- };
-
- // Load the remaining lanes into register 0.
- auto LanesToLoadToReg0 =
- llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
- LoadToLaneInstrsAscending.begin() + NumLanes / 2);
- auto PrevReg = SubregToReg->getOperand(0).getReg();
- for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
- PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
- LoadInstr->getOperand(3).getReg());
- DelInstrs.push_back(LoadInstr);
- }
- auto LastLoadReg0 = PrevReg;
-
- // First load into register 1. Perform a LDRSui to zero out the upper lanes in
- // a single instruction.
- auto Lane0Load = *LoadToLaneInstrsAscending.begin();
- auto OriginalSplitLoad =
- *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
- auto DestRegForMiddleIndex = MRI.createVirtualRegister(
- MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
-
- MachineInstrBuilder MiddleIndexLoadInstr =
- CreateLoadInstruction(NumLanes, DestRegForMiddleIndex,
- OriginalSplitLoad->getOperand(3).getReg());
-
- InstrIdxForVirtReg.insert(
- std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
- InsInstrs.push_back(MiddleIndexLoadInstr);
- DelInstrs.push_back(OriginalSplitLoad);
-
- // Subreg To Reg instruction for register 1.
- auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
- unsigned SubregType;
- switch (NumLanes) {
- case 4:
- SubregType = AArch64::ssub;
- break;
- case 8:
- SubregType = AArch64::hsub;
- break;
- case 16:
- SubregType = AArch64::bsub;
- break;
- default:
- llvm_unreachable(
- "Got invalid NumLanes for machine-combiner gather pattern");
- }
-
- auto SubRegToRegInstr =
- BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
- DestRegForSubregToReg)
- .addImm(0)
- .addReg(DestRegForMiddleIndex, getKillRegState(true))
- .addImm(SubregType);
- InstrIdxForVirtReg.insert(
- std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
- InsInstrs.push_back(SubRegToRegInstr);
-
- // Load remaining lanes into register 1.
- auto LanesToLoadToReg1 =
- llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
- LoadToLaneInstrsAscending.end());
- PrevReg = SubRegToRegInstr->getOperand(0).getReg();
- for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
- PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1,
- LoadInstr->getOperand(3).getReg());
- if (Index == NumLanes / 2 - 2) {
- break;
- }
- DelInstrs.push_back(LoadInstr);
- }
- auto LastLoadReg1 = PrevReg;
-
- // Create the final zip instruction to combine the results.
- MachineInstrBuilder ZipInstr =
- BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
- Root.getOperand(0).getReg())
- .addReg(LastLoadReg0)
- .addReg(LastLoadReg1);
- InsInstrs.push_back(ZipInstr);
-}
-
CombinerObjective
AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
switch (Pattern) {
case AArch64MachineCombinerPattern::SUBADD_OP1:
case AArch64MachineCombinerPattern::SUBADD_OP2:
- case AArch64MachineCombinerPattern::GATHER_LANE_i32:
- case AArch64MachineCombinerPattern::GATHER_LANE_i16:
- case AArch64MachineCombinerPattern::GATHER_LANE_i8:
return CombinerObjective::MustReduceDepth;
default:
return TargetInstrInfo::getCombinerObjective(Pattern);
@@ -7672,10 +7423,6 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
if (getMiscPatterns(Root, Patterns))
return true;
- // Load patterns
- if (getLoadPatterns(Root, Patterns))
- return true;
-
return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
DoRegPressureReduce);
}
@@ -8931,21 +8678,6 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
break;
}
- case AArch64MachineCombinerPattern::GATHER_LANE_i32: {
- generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
- Pattern, 4);
- break;
- }
- case AArch64MachineCombinerPattern::GATHER_LANE_i16: {
- generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
- Pattern, 8);
- break;
- }
- case AArch64MachineCombinerPattern::GATHER_LANE_i8: {
- generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
- Pattern, 16);
- break;
- }
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 02734866..7c255da 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -172,10 +172,6 @@ enum AArch64MachineCombinerPattern : unsigned {
FMULv8i16_indexed_OP2,
FNMADD,
-
- GATHER_LANE_i32,
- GATHER_LANE_i16,
- GATHER_LANE_i8
};
class AArch64InstrInfo final : public AArch64GenInstrInfo {
const AArch64RegisterInfo RI;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 9ebdf2e..07cacfa 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -519,10 +519,10 @@ def SDT_AArch64uaddlp : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
def SDT_AArch64ldiapp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
-def SDT_AArch64ldnp : SDTypeProfile<2, 1, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDT_AArch64ldnp : SDTypeProfile<2, 1, [SDTCisVT<0, v2i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
def SDT_AArch64stilp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
-def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
// Generates the general dynamic sequences, i.e.
// adrp x0, :tlsdesc:var
@@ -2033,7 +2033,7 @@ let Predicates = [HasPAuth] in {
def DZB : SignAuthZero<prefix_z, 0b11, !strconcat(asm, "dzb"), op>;
}
- defm PAC : SignAuth<0b000, 0b010, "pac", int_ptrauth_sign>;
+ defm PAC : SignAuth<0b000, 0b010, "pac", null_frag>;
defm AUT : SignAuth<0b001, 0b011, "aut", null_frag>;
def XPACI : ClearAuth<0, "xpaci">;
@@ -2153,6 +2153,26 @@ let Predicates = [HasPAuth] in {
let Uses = [];
}
+ // PAC pseudo instruction. In AsmPrinter, it is expanded into an actual PAC*
+ // instruction immediately preceded by the discriminator computation.
+ // This enforces that the expected immediate modifier is used for signing,
+ // even if an attacker is able to substitute AddrDisc.
+ def PAC : Pseudo<(outs GPR64:$SignedVal),
+ (ins GPR64:$Val, i32imm:$Key, i64imm:$Disc, GPR64noip:$AddrDisc),
+ [], "$SignedVal = $Val">, Sched<[WriteI, ReadI]> {
+ let isCodeGenOnly = 1;
+ let hasSideEffects = 0;
+ let mayStore = 0;
+ let mayLoad = 0;
+ let Size = 12;
+ let Defs = [X16, X17];
+ let usesCustomInserter = 1;
+ }
+
+ // A standalone pattern is used so that the literal 0 can be passed as $Disc.
+ def : Pat<(int_ptrauth_sign GPR64:$Val, timm:$Key, GPR64noip:$AddrDisc),
+ (PAC GPR64:$Val, $Key, 0, GPR64noip:$AddrDisc)>;
+
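The standalone pattern fires for plain calls to the pre-existing llvm.ptrauth.sign intrinsic; a hedged IRBuilder sketch of such a call site (the key value 2, i.e. DA, is chosen purely for illustration):

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Intrinsics.h"
    using namespace llvm;

    // Builds: %signed = call i64 @llvm.ptrauth.sign(i64 %val, i32 2, i64 %disc)
    // On AArch64 this now selects to the PAC pseudo with $Disc = 0 and
    // $AddrDisc = %disc, later tightened by fixupPtrauthDiscriminator.
    static Value *emitPtrauthSignCall(IRBuilder<> &B, Value *Val, Value *Disc) {
      return B.CreateIntrinsic(Intrinsic::ptrauth_sign, {},
                               {Val, B.getInt32(2), Disc});
    }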
// AUT and re-PAC a value, using different keys/data.
// This directly manipulates x16/x17, which are the only registers that
// certain OSs guarantee are safe to use for sensitive operations.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
index c218831..85de2d5 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
@@ -36,7 +36,7 @@ void AArch64_ELFTargetObjectFile::Initialize(MCContext &Ctx,
// SHF_AARCH64_PURECODE flag set if the "+execute-only" target feature is
// present.
if (TM.getMCSubtargetInfo()->hasFeature(AArch64::FeatureExecuteOnly)) {
- auto *Text = cast<MCSectionELF>(TextSection);
+ auto *Text = static_cast<MCSectionELF *>(TextSection);
Text->setFlags(Text->getFlags() | ELF::SHF_AARCH64_PURECODE);
}
}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 08f547a..6257e99 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -523,7 +523,8 @@ void AArch64TargetELFStreamer::finish() {
// mark it execute-only if it is empty and there is at least one
// execute-only section in the object.
if (any_of(Asm, [](const MCSection &Sec) {
- return cast<MCSectionELF>(Sec).getFlags() & ELF::SHF_AARCH64_PURECODE;
+ return static_cast<const MCSectionELF &>(Sec).getFlags() &
+ ELF::SHF_AARCH64_PURECODE;
})) {
auto *Text =
static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection());
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index 1ac340a..a22a17a 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -132,7 +132,8 @@ static bool canUseLocalRelocation(const MCSectionMachO &Section,
// But only if they don't point to a few forbidden sections.
if (!Symbol.isInSection())
return true;
- const MCSectionMachO &RefSec = cast<MCSectionMachO>(Symbol.getSection());
+ const MCSectionMachO &RefSec =
+ static_cast<MCSectionMachO &>(Symbol.getSection());
if (RefSec.getType() == MachO::S_CSTRING_LITERALS)
return false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 8b8fc8b..a17fb93 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -286,6 +286,12 @@ def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch",
"VMEM CU scope prefetches do not fail on illegal address"
>;
+def FeatureCUStores : SubtargetFeature<"cu-stores",
+ "HasCUStores",
+ "true",
+ "Whether SCOPE_CU stores can be used on GFX12.5"
+>;
+
def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard",
"HasVcmpxExecWARHazard",
"true",
@@ -1988,6 +1994,7 @@ def FeatureISAVersion12 : FeatureSet<
def FeatureISAVersion12_50 : FeatureSet<
[FeatureGFX12,
FeatureGFX1250Insts,
+ FeatureCUStores,
FeatureCuMode,
Feature64BitLiterals,
FeatureLDSBankCount32,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 4b3dc37..6681393 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -552,6 +552,7 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
MCContext &Ctx = MF.getContext();
uint16_t KernelCodeProperties = 0;
const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
KernelCodeProperties |=
@@ -581,10 +582,13 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
}
- if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
+ if (ST.isWave32()) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
}
+ if (isGFX1250(ST) && ST.hasCUStores()) {
+ KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES;
+ }
// CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
// un-evaluatable at this point so it cannot be conditionally checked here.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 49d8b44..59cc1df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -13,7 +13,6 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e3ca09e..6118933 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -391,8 +391,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
// Library functions. These default to Expand, but we have instructions
// for them.
setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
- ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
- MVT::f32, Legal);
+ ISD::FROUNDEVEN, ISD::FTRUNC},
+ {MVT::f16, MVT::f32}, Legal);
+ setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, MVT::f32, Legal);
setOperationAction(ISD::FLOG2, MVT::f32, Custom);
setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
@@ -412,9 +413,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
- if (Subtarget->has16BitInsts())
+ if (Subtarget->has16BitInsts()) {
setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
- else {
+ setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Legal);
+ } else {
setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
}
@@ -4844,94 +4846,11 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
return SDValue();
}
-// Detect when CMP and SELECT use the same constant and fold them to avoid
-// loading the constant twice. Specifically handles patterns like:
-// %cmp = icmp eq i32 %val, 4242
-// %sel = select i1 %cmp, i32 4242, i32 %other
-// It can be optimized to reuse %val instead of 4242 in select.
-static SDValue
-foldCmpSelectWithSharedConstant(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
- const AMDGPUSubtarget *ST) {
- SDValue Cond = N->getOperand(0);
- SDValue TrueVal = N->getOperand(1);
- SDValue FalseVal = N->getOperand(2);
-
- // Check if condition is a comparison.
- if (Cond.getOpcode() != ISD::SETCC)
- return SDValue();
-
- SDValue LHS = Cond.getOperand(0);
- SDValue RHS = Cond.getOperand(1);
- ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
-
- bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
- bool isInteger = LHS.getValueType().isInteger();
-
- // Handle simple floating-point and integer types only.
- if (!isFloatingPoint && !isInteger)
- return SDValue();
-
- bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
- bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
- if (!isEquality && !isNonEquality)
- return SDValue();
-
- SDValue ArgVal, ConstVal;
- if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
- (isInteger && isa<ConstantSDNode>(RHS))) {
- ConstVal = RHS;
- ArgVal = LHS;
- } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
- (isInteger && isa<ConstantSDNode>(LHS))) {
- ConstVal = LHS;
- ArgVal = RHS;
- } else {
- return SDValue();
- }
-
- // Check if constant should not be optimized - early return if not.
- if (isFloatingPoint) {
- const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
- const GCNSubtarget *GCNST = static_cast<const GCNSubtarget *>(ST);
-
- // Only optimize normal floating-point values (finite, non-zero, and
- // non-subnormal as per IEEE 754), skip optimization for inlinable
- // floating-point constants.
- if (!Val.isNormal() || GCNST->getInstrInfo()->isInlineConstant(Val))
- return SDValue();
- } else {
- int64_t IntVal = cast<ConstantSDNode>(ConstVal)->getSExtValue();
-
- // Skip optimization for inlinable integer immediates.
- // Inlinable immediates include: -16 to 64 (inclusive).
- if (IntVal >= -16 && IntVal <= 64)
- return SDValue();
- }
-
- // For equality and non-equality comparisons, patterns:
- // select (setcc x, const), const, y -> select (setcc x, const), x, y
- // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
- if (!(isEquality && TrueVal == ConstVal) &&
- !(isNonEquality && FalseVal == ConstVal))
- return SDValue();
-
- SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
- SDValue SelectRHS =
- (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
- return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
- SelectLHS, SelectRHS);
-}
-
SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
return Folded;
- // Try to fold CMP + SELECT patterns with shared constants (both FP and
- // integer).
- if (SDValue Folded = foldCmpSelectWithSharedConstant(N, DCI, Subtarget))
- return Folded;
-
SDValue Cond = N->getOperand(0);
if (Cond.getOpcode() != ISD::SETCC)
return SDValue();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c865082..38f9ee5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -836,8 +836,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
// When we are not using -fgpu-rdc, we can run accelerator code
// selection relatively early, but still after linking to prevent
// eager removal of potentially reachable symbols.
- if (EnableHipStdPar)
+ if (EnableHipStdPar) {
+ PM.addPass(HipStdParMathFixupPass());
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
+ }
PM.addPass(AMDGPUPrintfRuntimeBindingPass());
}
@@ -916,8 +918,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
// selection after linking to prevent, otherwise we end up removing
// potentially reachable symbols that were exported as external in other
// modules.
- if (EnableHipStdPar)
+ if (EnableHipStdPar) {
+ PM.addPass(HipStdParMathFixupPass());
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
+ }
// We want to support the -lto-partitions=N option as "best effort".
// For that, we need to lower LDS earlier in the pipeline before the
// module is partitioned for codegen.
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 421fc42..44e65b3 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -6066,6 +6066,12 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
ExprVal, ValRange);
if (Val)
ImpliedUserSGPRCount += 1;
+ } else if (ID == ".amdhsa_uses_cu_stores") {
+ if (!isGFX1250())
+ return Error(IDRange.Start, "directive requires gfx12.5", IDRange);
+
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_USES_CU_STORES, ExprVal, ValRange);
} else if (ID == ".amdhsa_wavefront_size32") {
EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
if (IVersion.Major < 10)
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 5c1989b..ffe6b06 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -2556,6 +2556,9 @@ Expected<bool> AMDGPUDisassembler::decodeKernelDescriptorDirective(
KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size",
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
+ if (isGFX1250())
+ PRINT_DIRECTIVE(".amdhsa_uses_cu_stores",
+ KERNEL_CODE_PROPERTY_USES_CU_STORES);
if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0)
return createReservedKDBitsError(KERNEL_CODE_PROPERTY_RESERVED0,
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 7207c25..0f172e0d 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -369,31 +369,68 @@ multiclass FLAT_Global_Store_Pseudo_t16<string opName> {
}
}
-class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo<
+// Async loads, introduced in gfx1250, will store directly
+// to a DS address in vdst (they will not use M0 for the DS address).
+class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsync = 0> : FLAT_Pseudo<
opName,
(outs ),
!con(
- !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)),
- (ins flat_offset:$offset, CPol_0:$cpol)),
- " $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
- let LGKM_CNT = 1;
+ !if(IsAsync, (ins VGPR_32:$vdst), (ins)),
+ !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)),
+ (ins flat_offset:$offset, CPol_0:$cpol)),
+ !if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
+ let LGKM_CNT = !not(IsAsync);
+ let VM_CNT = !not(IsAsync);
+ let ASYNC_CNT = IsAsync;
let is_flat_global = 1;
let lds = 1;
let has_data = 0;
+ let has_vdst = IsAsync; // vdst carries the DS address when IsAsync
+ let mayLoad = 1;
+ let mayStore = 1;
+ let has_saddr = 1;
+ let enabled_saddr = EnableSaddr;
+ let VALU = 1;
+ let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", "");
+ let Uses = !if(IsAsync, [EXEC, ASYNCcnt], [M0, EXEC]);
+ let Defs = !if(IsAsync, [ASYNCcnt], []);
+ let SchedRW = [WriteVMEM, WriteLDS];
+}
+
+multiclass FLAT_Global_Load_LDS_Pseudo<string opName, bit IsAsync = 0> {
+ def "" : FLAT_Global_Load_LDS_Pseudo<opName, 0, IsAsync>,
+ GlobalSaddrTable<0, opName>;
+ def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1, IsAsync>,
+ GlobalSaddrTable<1, opName>;
+}
+
+class FLAT_Global_STORE_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo<
+ opName,
+ (outs ),
+ !con(
+ !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), (ins VGPR_32:$vdata),
+ (ins flat_offset:$offset, CPol_0:$cpol)),
+ " $vaddr, $vdata"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
+ let VM_CNT = 0;
+ let ASYNC_CNT = 1;
+ let is_flat_global = 1;
+ let lds = 1;
+ let has_data = 1; // vdata carries the DS address
let has_vdst = 0;
let mayLoad = 1;
let mayStore = 1;
let has_saddr = 1;
let enabled_saddr = EnableSaddr;
let VALU = 1;
- let Uses = [M0, EXEC];
+ let Uses = [EXEC, ASYNCcnt];
+ let Defs = [ASYNCcnt];
let SchedRW = [WriteVMEM, WriteLDS];
}
-multiclass FLAT_Global_Load_LDS_Pseudo<string opName> {
- def "" : FLAT_Global_Load_LDS_Pseudo<opName>,
+multiclass FLAT_Global_STORE_LDS_Pseudo<string opName> {
+ def "" : FLAT_Global_STORE_LDS_Pseudo<opName>,
GlobalSaddrTable<0, opName>;
- def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1>,
+ def _SADDR : FLAT_Global_STORE_LDS_Pseudo<opName, 1>,
GlobalSaddrTable<1, opName>;
}
@@ -1156,6 +1193,15 @@ let SubtargetPredicate = isGFX12Plus in {
let SubtargetPredicate = isGFX1250Plus in {
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b8", 1>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b32", 1>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b64", 1>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b128", 1>;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b8">;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b32">;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b64">;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b128">;
+
def TENSOR_SAVE : FLAT_Global_Tensor_Pseudo<"tensor_save", 1>;
def TENSOR_STOP : FLAT_Global_Tensor_Pseudo<"tensor_stop">;
} // End SubtargetPredicate = isGFX1250Plus
@@ -3374,6 +3420,15 @@ defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>;
defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>;
defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x5f>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x60>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x61>;
+defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x62>;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x63>;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x64>;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x65>;
+defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x66>;
+
defm GLOBAL_LOAD_TR_B128_w32 : VFLAT_Real_AllAddr_gfx1250<0x057, "global_load_tr16_b128">;
defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "global_load_tr8_b64">;
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 7d6723a..334afd3 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -38,7 +38,11 @@ bool llvm::isEqual(const GCNRPTracker::LiveRegSet &S1,
unsigned GCNRegPressure::getRegKind(const TargetRegisterClass *RC,
const SIRegisterInfo *STI) {
- return STI->isSGPRClass(RC) ? SGPR : (STI->isAGPRClass(RC) ? AGPR : VGPR);
+ return STI->isSGPRClass(RC)
+ ? SGPR
+ : (STI->isAGPRClass(RC)
+ ? AGPR
+ : (STI->isVectorSuperClass(RC) ? AVGPR : VGPR));
}
void GCNRegPressure::inc(unsigned Reg,
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 3749b6d..ea33a22 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -29,43 +29,57 @@ class raw_ostream;
class SlotIndex;
struct GCNRegPressure {
- enum RegKind { SGPR, VGPR, AGPR, TOTAL_KINDS };
+ enum RegKind { SGPR, VGPR, AGPR, AVGPR, TOTAL_KINDS };
GCNRegPressure() {
clear();
}
- bool empty() const { return !Value[SGPR] && !Value[VGPR] && !Value[AGPR]; }
+ bool empty() const {
+ return !Value[SGPR] && !Value[VGPR] && !Value[AGPR] && !Value[AVGPR];
+ }
void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); }
/// \returns the SGPR32 pressure
unsigned getSGPRNum() const { return Value[SGPR]; }
- /// \returns the aggregated ArchVGPR32, AccVGPR32 pressure dependent upon \p
- /// UnifiedVGPRFile
+ /// \returns the aggregated ArchVGPR32, AccVGPR32, and pseudo AVGPR pressure
+ /// dependent upon \p UnifiedVGPRFile
unsigned getVGPRNum(bool UnifiedVGPRFile) const {
if (UnifiedVGPRFile) {
- return Value[AGPR] ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR])
- : Value[VGPR];
+ return Value[AGPR]
+ ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR], Value[AVGPR])
+ : Value[VGPR] + Value[AVGPR];
}
- return std::max(Value[VGPR], Value[AGPR]);
+ // AVGPR assignment priority is based on the width of the register. Count
+ // AVGPR pressure as VGPR pressure.
+ return std::max(Value[VGPR] + Value[AVGPR], Value[AGPR]);
}
/// Returns the aggregated VGPR pressure, assuming \p NumArchVGPRs ArchVGPRs
- /// and \p NumAGPRs AGPRS, for a target with a unified VGPR file.
+ /// \p NumAGPRs AGPRs, and \p NumAVGPRs AVGPRs, for a target with a unified
+ /// VGPR file.
inline static unsigned getUnifiedVGPRNum(unsigned NumArchVGPRs,
- unsigned NumAGPRs) {
- return alignTo(NumArchVGPRs, AMDGPU::IsaInfo::getArchVGPRAllocGranule()) +
+ unsigned NumAGPRs,
+ unsigned NumAVGPRs) {
+
+ // Assume AVGPRs will be assigned as VGPRs.
+ return alignTo(NumArchVGPRs + NumAVGPRs,
+ AMDGPU::IsaInfo::getArchVGPRAllocGranule()) +
NumAGPRs;
}
- /// \returns the ArchVGPR32 pressure
- unsigned getArchVGPRNum() const { return Value[VGPR]; }
+ /// \returns the ArchVGPR32 pressure, plus the AVGPRs, which we assume will
+ /// be allocated as VGPRs
+ unsigned getArchVGPRNum() const { return Value[VGPR] + Value[AVGPR]; }
/// \returns the AccVGPR32 pressure
unsigned getAGPRNum() const { return Value[AGPR]; }
+ /// \returns the AVGPR32 pressure
+ unsigned getAVGPRNum() const { return Value[AVGPR]; }
unsigned getVGPRTuplesWeight() const {
- return std::max(Value[TOTAL_KINDS + VGPR], Value[TOTAL_KINDS + AGPR]);
+ return std::max(Value[TOTAL_KINDS + VGPR] + Value[TOTAL_KINDS + AVGPR],
+ Value[TOTAL_KINDS + AGPR]);
}
unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; }
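A worked instance of the unified-file accounting above, with an assumed ArchVGPR allocation granule of 4 (the real granule comes from AMDGPU::IsaInfo and is target-dependent):

    #include "llvm/Support/MathExtras.h"

    // AVGPRs are folded into the ArchVGPR side before aligning:
    //   getUnifiedVGPRNum(10, 8, 5) = alignTo(10 + 5, 4) + 8 = 16 + 8 = 24
    static_assert(llvm::alignTo<4>(10 + 5) + 8 == 24,
                  "AVGPR pressure is counted on the ArchVGPR side");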
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index a655308..ce1ce68 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1911,14 +1911,12 @@ void PreRARematStage::rematerialize() {
for (auto &[DefMI, Remat] : Rematerializations) {
MachineBasicBlock::iterator InsertPos(Remat.UseMI);
Register Reg = DefMI->getOperand(0).getReg();
- unsigned SubReg = DefMI->getOperand(0).getSubReg();
unsigned DefRegion = MIRegion.at(DefMI);
// Rematerialize DefMI to its use block.
- TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, SubReg, *DefMI,
- *DAG.TRI);
+ TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
+ AMDGPU::NoSubRegister, *DefMI, *DAG.TRI);
Remat.RematMI = &*std::prev(InsertPos);
- Remat.RematMI->getOperand(0).setSubReg(SubReg);
DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
// Update region boundaries in regions we sank from (remove defining MI)
@@ -2064,14 +2062,13 @@ void PreRARematStage::finalizeGCNSchedStage() {
MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second);
MachineBasicBlock *MBB = RegionBB[DefRegion];
Register Reg = RematMI.getOperand(0).getReg();
- unsigned SubReg = RematMI.getOperand(0).getSubReg();
// Re-rematerialize MI at the end of its original region. Note that it may
// not be rematerialized exactly in the same position as originally within
// the region, but it should not matter much.
- TII->reMaterialize(*MBB, InsertPos, Reg, SubReg, RematMI, *DAG.TRI);
+ TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI,
+ *DAG.TRI);
MachineInstr *NewMI = &*std::prev(InsertPos);
- NewMI->getOperand(0).setSubReg(SubReg);
DAG.LIS->InsertMachineInstrInMaps(*NewMI);
auto UseRegion = MIRegion.find(Remat.UseMI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 9a2bab1..0a0a107 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -537,6 +537,63 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
return getMaxNumVGPRs(MF.getFunction());
}
+std::pair<unsigned, unsigned>
+GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
+ const unsigned MaxVectorRegs = getMaxNumVGPRs(F);
+
+ unsigned MaxNumVGPRs = MaxVectorRegs;
+ unsigned MaxNumAGPRs = 0;
+
+ // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
+ // a wave may have up to 512 total vector registers, combining both
+ // VGPRs and AGPRs. Hence, in an entry function without calls and without
+ // AGPRs used within it, it is possible to use the whole vector register
+ // budget for VGPRs.
+ //
+ // TODO: it should be possible to estimate maximum AGPR/VGPR pressure and
+ // split the register file accordingly.
+ if (hasGFX90AInsts()) {
+ unsigned MinNumAGPRs = 0;
+ const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();
+ const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
+
+ const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};
+
+ // TODO: The lower bound should probably force the number of required
+ // registers up, overriding amdgpu-waves-per-eu.
+ std::tie(MinNumAGPRs, MaxNumAGPRs) =
+ AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", DefaultNumAGPR,
+ /*OnlyFirstRequired=*/true);
+
+ if (MinNumAGPRs == DefaultNumAGPR.first) {
+ // Default to splitting half the registers if AGPRs are required.
+ MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2;
+ } else {
+ // Align to accum_offset's allocation granularity.
+ MinNumAGPRs = alignTo(MinNumAGPRs, 4);
+
+ MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs);
+ }
+
+ // Clamp values to be in bounds of our limits, and ensure min <= max.
+
+ MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
+ MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs);
+
+ MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs);
+ MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);
+
+ assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
+ MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs &&
+ "invalid register counts");
+ } else if (hasMAIInsts()) {
+ // On gfx908 the number of AGPRs always equals the number of VGPRs.
+ MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs;
+ }
+
+ return std::pair(MaxNumVGPRs, MaxNumAGPRs);
+}
+
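A sketch of the intended call pattern for the new query (caller-side names are illustrative; a MachineFunction MF is assumed to be in scope):

    // Split the per-function vector register budget once, then size both files:
    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction());
    // Per the assertion above, MaxNumVGPRs + MaxNumAGPRs never exceeds the
    // combined budget returned by getMaxNumVGPRs(F).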
void GCNSubtarget::adjustSchedDependency(
SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
const TargetSchedModel *SchedModel) const {
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 88a269f..10ded0e 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -248,6 +248,7 @@ protected:
bool HasVmemPrefInsts = false;
bool HasSafeSmemPrefetch = false;
bool HasSafeCUPrefetch = false;
+ bool HasCUStores = false;
bool HasVcmpxExecWARHazard = false;
bool HasLdsBranchVmemWARHazard = false;
bool HasNSAtoVMEMBug = false;
@@ -998,6 +999,8 @@ public:
bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; }
+ bool hasCUStores() const { return HasCUStores; }
+
// Has s_cmpk_* instructions.
bool hasSCmpK() const { return getGeneration() < GFX12; }
@@ -1667,6 +1670,10 @@ public:
return getMaxNumVGPRs(F);
}
+ /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
+ /// of waves per execution unit required for the function \p MF.
+ std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const;
+
/// \returns Maximum number of VGPRs that meets number of waves per execution
/// unit requirement for function \p MF, or number of VGPRs explicitly
/// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 10f6d33..43ca548 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -440,6 +440,11 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE,
".amdhsa_user_sgpr_private_segment_size");
+ if (isGFX1250(STI))
+ PrintField(KD.kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES_SHIFT,
+ amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES,
+ ".amdhsa_uses_cu_stores");
if (IVersion.Major >= 10)
PrintField(KD.kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT,
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index e5d1eaa..b77da4d 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1062,9 +1062,13 @@ bool SIFoldOperandsImpl::tryFoldRegSeqSplat(
switch (OpTy) {
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
break;
case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
break;
default:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0eee7ad..9017f4f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -14179,6 +14179,8 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
(VT == MVT::f32 || VT == MVT::f64 ||
(VT == MVT::f16 && Subtarget->has16BitInsts()) ||
+ (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
+ (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
(VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
Op0.hasOneUse()) {
if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
@@ -15894,6 +15896,78 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
return SDValue(CSrc, 0);
}
+SDValue SITargetLowering::performSelectCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+
+  // Try to fold CMP + SELECT patterns with shared constants (both FP and
+  // integer).
+  // Detect when CMP and SELECT use the same constant and fold them to avoid
+  // loading the constant twice. Specifically handles patterns like:
+  //   %cmp = icmp eq i32 %val, 4242
+  //   %sel = select i1 %cmp, i32 4242, i32 %other
+  // which can be optimized to reuse %val instead of 4242 in the select.
+ SDValue Cond = N->getOperand(0);
+ SDValue TrueVal = N->getOperand(1);
+ SDValue FalseVal = N->getOperand(2);
+
+ // Check if condition is a comparison.
+ if (Cond.getOpcode() != ISD::SETCC)
+ return SDValue();
+
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+ bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
+ bool isInteger = LHS.getValueType().isInteger();
+
+ // Handle simple floating-point and integer types only.
+ if (!isFloatingPoint && !isInteger)
+ return SDValue();
+
+ bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
+ bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
+ if (!isEquality && !isNonEquality)
+ return SDValue();
+
+ SDValue ArgVal, ConstVal;
+ if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
+ (isInteger && isa<ConstantSDNode>(RHS))) {
+ ConstVal = RHS;
+ ArgVal = LHS;
+ } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
+ (isInteger && isa<ConstantSDNode>(LHS))) {
+ ConstVal = LHS;
+ ArgVal = RHS;
+ } else {
+ return SDValue();
+ }
+
+ // Skip optimization for inlinable immediates.
+ if (isFloatingPoint) {
+ const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
+ if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
+ return SDValue();
+ } else {
+ if (AMDGPU::isInlinableIntLiteral(
+ cast<ConstantSDNode>(ConstVal)->getSExtValue()))
+ return SDValue();
+ }
+
+ // For equality and non-equality comparisons, patterns:
+ // select (setcc x, const), const, y -> select (setcc x, const), x, y
+ // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
+ if (!(isEquality && TrueVal == ConstVal) &&
+ !(isNonEquality && FalseVal == ConstVal))
+ return SDValue();
+
+ SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
+ SDValue SelectRHS =
+ (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
+ return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
+ SelectLHS, SelectRHS);
+}
+
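The non-equality form exercises the second pattern; a minimal sketch in LLVM IR, assuming 4242 is not an inlinable immediate (inlinable literals are skipped above):

    %cmp = icmp ne i32 %val, 4242
    %sel = select i1 %cmp, i32 %other, i32 4242
    ; folds to, reusing the compared value instead of re-materializing the constant:
    %sel = select i1 %cmp, i32 %other, i32 %val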
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
@@ -15942,6 +16016,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performFMulCombine(N, DCI);
case ISD::SETCC:
return performSetCCCombine(N, DCI);
+ case ISD::SELECT:
+ if (auto Res = performSelectCombine(N, DCI))
+ return Res;
+ break;
case ISD::FMAXNUM:
case ISD::FMINNUM:
case ISD::FMAXNUM_IEEE:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index acf6158..dedd9ae 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -211,6 +211,7 @@ private:
SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFPRoundCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const;
unsigned getFusedOpcode(const SelectionDAG &DAG,
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index dd3f2fe..520c321 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -552,7 +552,7 @@ public:
(!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
// FLAT and SCRATCH instructions may access scratch. Other VMEM
// instructions do not.
- if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
+ if (TII->mayAccessScratchThroughFlat(Inst))
return SCRATCH_WRITE_ACCESS;
return VMEM_WRITE_ACCESS;
}
@@ -565,7 +565,6 @@ public:
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
- bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
bool isVmemAccess(const MachineInstr &MI) const;
bool generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
@@ -2160,32 +2159,6 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
return false;
}
-// This is a flat memory operation. Check to see if it has memory tokens for
-// either scratch or FLAT.
-bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
- const MachineInstr &MI) const {
- assert(TII->isFLAT(MI));
-
- // SCRATCH instructions always access scratch.
- if (TII->isFLATScratch(MI))
- return true;
-
- // GLOBAL instructions never access scratch.
- if (TII->isFLATGlobal(MI))
- return false;
-
- // If there are no memory operands then conservatively assume the flat
- // operation may access scratch.
- if (MI.memoperands_empty())
- return true;
-
- // See if any memory operand specifies an address space that involves scratch.
- return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
- unsigned AS = Memop->getAddrSpace();
- return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
- });
-}
-
bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
(TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 40e6871..2aa6b4e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2508,7 +2508,20 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
.addReg(DstHi);
}
break;
+
+ case AMDGPU::V_MAX_BF16_PSEUDO_e64:
+ assert(ST.hasBF16PackedInsts());
+ MI.setDesc(get(AMDGPU::V_PK_MAX_NUM_BF16));
+ MI.addOperand(MachineOperand::CreateImm(0)); // op_sel
+ MI.addOperand(MachineOperand::CreateImm(0)); // neg_lo
+ MI.addOperand(MachineOperand::CreateImm(0)); // neg_hi
+ auto Op0 = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
+ Op0->setImm(Op0->getImm() | SISrcMods::OP_SEL_1);
+ auto Op1 = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
+ Op1->setImm(Op1->getImm() | SISrcMods::OP_SEL_1);
+ break;
}
+
return true;
}
@@ -2733,49 +2746,47 @@ static MachineInstr *swapImmOperands(MachineInstr &MI,
}
bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
- const MachineOperand *MO0, unsigned OpIdx1,
- const MachineOperand *MO1) const {
+ unsigned OpIdx1) const {
const MCInstrDesc &InstDesc = MI.getDesc();
const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
- const TargetRegisterClass *DefinedRC1 =
- OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo1.RegClass) : nullptr;
- const TargetRegisterClass *DefinedRC0 =
- OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo0.RegClass) : nullptr;
unsigned Opc = MI.getOpcode();
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ const MachineOperand &MO0 = MI.getOperand(OpIdx0);
+ const MachineOperand &MO1 = MI.getOperand(OpIdx1);
+
// Swap doesn't breach constant bus or literal limits
// It may move literal to position other than src0, this is not allowed
// pre-gfx10 However, most test cases need literals in Src0 for VOP
// FIXME: After gfx9, literal can be in place other than Src0
if (isVALU(MI)) {
- if ((int)OpIdx0 == Src0Idx && !MO0->isReg() &&
- !isInlineConstant(*MO0, OpInfo1))
+ if ((int)OpIdx0 == Src0Idx && !MO0.isReg() &&
+ !isInlineConstant(MO0, OpInfo1))
return false;
- if ((int)OpIdx1 == Src0Idx && !MO1->isReg() &&
- !isInlineConstant(*MO1, OpInfo0))
+ if ((int)OpIdx1 == Src0Idx && !MO1.isReg() &&
+ !isInlineConstant(MO1, OpInfo0))
return false;
}
- if ((int)OpIdx1 != Src0Idx && MO0->isReg()) {
- if (!DefinedRC1)
+ if ((int)OpIdx1 != Src0Idx && MO0.isReg()) {
+ if (OpInfo1.RegClass == -1)
return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
- return isLegalRegOperand(MI, OpIdx1, *MO0) &&
- (!MO1->isReg() || isLegalRegOperand(MI, OpIdx0, *MO1));
+ return isLegalRegOperand(MI, OpIdx1, MO0) &&
+ (!MO1.isReg() || isLegalRegOperand(MI, OpIdx0, MO1));
}
- if ((int)OpIdx0 != Src0Idx && MO1->isReg()) {
- if (!DefinedRC0)
+ if ((int)OpIdx0 != Src0Idx && MO1.isReg()) {
+ if (OpInfo0.RegClass == -1)
return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
- return (!MO0->isReg() || isLegalRegOperand(MI, OpIdx1, *MO0)) &&
- isLegalRegOperand(MI, OpIdx0, *MO1);
+ return (!MO0.isReg() || isLegalRegOperand(MI, OpIdx1, MO0)) &&
+ isLegalRegOperand(MI, OpIdx0, MO1);
}
// No need to check 64-bit literals since swapping does not bring new
// 64-bit literals into current instruction to fold to 32-bit
- return isImmOperandLegal(MI, OpIdx1, *MO0);
+ return isImmOperandLegal(MI, OpIdx1, MO0);
}
MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
@@ -2797,12 +2808,12 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
static_cast<int>(Src1Idx) &&
"inconsistency with findCommutedOpIndices");
- MachineOperand &Src0 = MI.getOperand(Src0Idx);
- MachineOperand &Src1 = MI.getOperand(Src1Idx);
- if (!isLegalToSwap(MI, Src0Idx, &Src0, Src1Idx, &Src1)) {
+ if (!isLegalToSwap(MI, Src0Idx, Src1Idx))
return nullptr;
- }
+
MachineInstr *CommutedMI = nullptr;
+ MachineOperand &Src0 = MI.getOperand(Src0Idx);
+ MachineOperand &Src1 = MI.getOperand(Src1Idx);
if (Src0.isReg() && Src1.isReg()) {
// Be sure to copy the source modifiers to the right place.
CommutedMI =
@@ -4238,6 +4249,32 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
}
+bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
+ if (!isFLAT(MI) || isFLATGlobal(MI))
+ return false;
+
+ // If scratch is not initialized, we can never access it.
+ if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
+ return false;
+
+ // SCRATCH instructions always access scratch.
+ if (isFLATScratch(MI))
+ return true;
+
+ // If there are no memory operands then conservatively assume the flat
+ // operation may access scratch.
+ if (MI.memoperands_empty())
+ return true;
+
+  // TODO (?): Does this need to be taught how to read noalias.addrspace?
+
+ // See if any memory operand specifies an address space that involves scratch.
+ return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
+ unsigned AS = Memop->getAddrSpace();
+ return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
+ });
+}
+
bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
// Skip the full operand and register alias search modifiesRegister
// does. There's only a handful of instructions that touch this, it's only an
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 800ea9a..e042b59 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -197,8 +197,7 @@ protected:
AMDGPU::OpName Src0OpName, MachineOperand &Src1,
AMDGPU::OpName Src1OpName) const;
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx,
- const MachineOperand *fromMO, unsigned toIdx,
- const MachineOperand *toMO) const;
+ unsigned toIdx) const;
MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned OpIdx0,
unsigned OpIdx1) const override;
@@ -679,6 +678,12 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::FLAT;
}
+ /// \returns true for SCRATCH_ instructions, or FLAT_ instructions with
+ /// SCRATCH_ memory operands.
+  /// Conservatively correct; returns true if \p MI cannot be proven
+  /// not to access scratch.
+ bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
+
static bool isBlockLoadStore(uint16_t Opcode) {
switch (Opcode) {
case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index b0be3f86..83b0490 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2865,6 +2865,7 @@ def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>;
def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>;
def VOP_I16_I16_I16_ARITH : VOPProfile <[i16, i16, i16, untyped], /*EnableClamp=*/1>;
+def VOP_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, untyped]>;
def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index d05be8f..54fa192 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1894,6 +1894,9 @@ let SubtargetPredicate = UseRealTrue16Insts in
def : ClampPat<V_MAX_F16_t16_e64, f16>;
let SubtargetPredicate = UseFakeTrue16Insts in
def : ClampPat<V_MAX_F16_fake16_e64, f16>;
+// FIXME-TRUE16: Pseudo expansion of this won't work with True16.
+let True16Predicate = UseFakeTrue16Insts in
+def : ClampPat<V_MAX_BF16_PSEUDO_e64, bf16>;
let SubtargetPredicate = HasVOP3PInsts in {
def : GCNPat <
@@ -1903,6 +1906,13 @@ def : GCNPat <
>;
}
+let SubtargetPredicate = HasBF16PackedInsts in {
+def : GCNPat <
+ (v2bf16 (AMDGPUclamp (VOP3PMods v2bf16:$src0, i32:$src0_modifiers))),
+ (V_PK_MAX_NUM_BF16 $src0_modifiers, $src0,
+ $src0_modifiers, $src0, DSTCLAMP.ENABLE)
+>;
+} // End SubtargetPredicate = HasBF16PackedInsts
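Since the pattern feeds $src0 into both packed operands, the emitted instruction is a max of the value with itself, and only the destination clamp bit does the work; a hypothetical lowering:

    v_pk_max_num_bf16 v0, v1, v1 clamp  ; clamps both bf16 halves to [0.0, 1.0]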
/********** ================================ **********/
/********** Floating point absolute/negative **********/
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 9f61bf8..9509199 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -351,6 +351,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF,
MachineRegisterInfo &MRI = MF.getRegInfo();
BitVector ReservedRegs = TRI->getReservedRegs(MF);
BitVector NonWwmAllocMask(TRI->getNumRegs());
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
// FIXME: MaxNumVGPRsForWwmAllocation might need to be adjusted in the future
// to have a balanced allocation between WWM values and per-thread vector
@@ -359,7 +360,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF,
NumRegs =
std::min(static_cast<unsigned>(MFI->getSGPRSpillVGPRs().size()), NumRegs);
- auto [MaxNumVGPRs, MaxNumAGPRs] = TRI->getMaxNumVectorRegs(MF);
+ auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction());
// Try to use the highest available registers for now. Later after
// vgpr-regalloc, they can be shifted to the lowest range.
unsigned I = 0;
@@ -376,7 +377,7 @@ void SILowerSGPRSpills::determineRegsForWWMAllocation(MachineFunction &MF,
// Reserve an arbitrary register and report the error.
TRI->markSuperRegs(RegMask, AMDGPU::VGPR0);
MF.getFunction().getContext().emitError(
- "can't find enough VGPRs for wwm-regalloc");
+ "cannot find enough VGPRs for wwm-regalloc");
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index f0be204..9a1448f 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -81,11 +81,15 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
}
- MayNeedAGPRs = ST.hasMAIInsts() && !MFMAVGPRForm;
- if (!MFMAVGPRForm && ST.hasGFX90AInsts() &&
- ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
- !mayUseAGPRs(F))
- MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
+ MayNeedAGPRs = ST.hasMAIInsts();
+ if (ST.hasGFX90AInsts()) {
+    // FIXME: MayNeedAGPRs is a misnomer for how this is used. MFMA selection
+    // should be separated from the availability of AGPRs.
+ if (MFMAVGPRForm ||
+ (ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
+ !mayUseAGPRs(F)))
+ MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
+ }
if (AMDGPU::isChainCC(CC)) {
// Chain functions don't receive an SP from their caller, but are free to
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 0e8a420..025731a 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -321,7 +321,7 @@ public:
bool IsNonTemporal,
bool IsLastUse = false) const = 0;
- virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
+ virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
return false;
};
@@ -602,7 +602,7 @@ public:
bool IsVolatile, bool IsNonTemporal,
bool IsLastUse) const override;
- bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
+ bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
@@ -2536,9 +2536,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
if (IsVolatile) {
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
- if (Op == SIMemOp::STORE)
- Changed |= insertWaitsBeforeSystemScopeStore(MI);
-
// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
// request cross address space as only the global address space can be
@@ -2551,11 +2548,26 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
return Changed;
}
-bool SIGfx12CacheControl::expandSystemScopeStore(
- MachineBasicBlock::iterator &MI) const {
- MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
- if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
- return insertWaitsBeforeSystemScopeStore(MI);
+bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
+ MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
+ if (!CPol)
+ return false;
+
+ const unsigned Scope = CPol->getImm() & CPol::SCOPE;
+
+ // GFX12.0 only: Extra waits needed before system scope stores.
+ if (!ST.hasGFX1250Insts()) {
+ if (!Atomic && Scope == CPol::SCOPE_SYS)
+ return insertWaitsBeforeSystemScopeStore(MI);
+ return false;
+ }
+
+ // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address
+ // space.
+  // We also require at least SCOPE_SE if we do not have the "cu-stores" feature.
+ if (Scope == CPol::SCOPE_CU &&
+ (!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI)))
+ return setScope(MI, CPol::SCOPE_SE);
return false;
}
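Concretely, on a gfx1250-style target without the cu-stores feature, a CU-scoped store that may reach scratch is promoted one scope level; a hypothetical before/after:

    scratch_store_b32 off, v0, s0 scope:SCOPE_CU   ; may access scratch
    ; finalizeStore rewrites the cache policy to
    scratch_store_b32 off, v0, s0 scope:SCOPE_SE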
@@ -2658,6 +2670,8 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
assert(!MI->mayLoad() && MI->mayStore());
bool Changed = false;
+  // FIXME: Necessary hack because the iterator can lose track of the store.
+ MachineInstr &StoreMI = *MI;
if (MOI.isAtomic()) {
if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
@@ -2674,6 +2688,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE);
+ Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true);
return Changed;
}
@@ -2686,7 +2701,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
// GFX12 specific, scope(desired coherence domain in cache hierarchy) is
// instruction field, do not confuse it with atomic scope.
- Changed |= CC->expandSystemScopeStore(MI);
+ Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false);
return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 84cfa87..f3acc5c 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -572,65 +572,6 @@ MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
}
-std::pair<unsigned, unsigned>
-SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const {
- const unsigned MaxVectorRegs = ST.getMaxNumVGPRs(MF);
-
- unsigned MaxNumVGPRs = MaxVectorRegs;
- unsigned MaxNumAGPRs = 0;
-
- // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
- // a wave may have up to 512 total vector registers combining together both
- // VGPRs and AGPRs. Hence, in an entry function without calls and without
- // AGPRs used within it, it is possible to use the whole vector register
- // budget for VGPRs.
- //
- // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
- // register file accordingly.
- if (ST.hasGFX90AInsts()) {
- unsigned MinNumAGPRs = 0;
- const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();
- const unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
-
- const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};
-
- // TODO: Move this logic into subtarget on IR function
- //
- // TODO: The lower bound should probably force the number of required
- // registers up, overriding amdgpu-waves-per-eu.
- std::tie(MinNumAGPRs, MaxNumAGPRs) = AMDGPU::getIntegerPairAttribute(
- MF.getFunction(), "amdgpu-agpr-alloc", DefaultNumAGPR,
- /*OnlyFirstRequired=*/true);
-
- if (MinNumAGPRs == DefaultNumAGPR.first) {
- // Default to splitting half the registers if AGPRs are required.
- MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2;
- } else {
- // Align to accum_offset's allocation granularity.
- MinNumAGPRs = alignTo(MinNumAGPRs, 4);
-
- MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs);
- }
-
- // Clamp values to be inbounds of our limits, and ensure min <= max.
-
- MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
- MinNumAGPRs = std::min(std::min(MinNumAGPRs, TotalNumAGPRs), MaxNumAGPRs);
-
- MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, TotalNumVGPRs);
- MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);
-
- assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
- MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= TotalNumVGPRs &&
- "invalid register counts");
- } else if (ST.hasMAIInsts()) {
- // On gfx908 the number of AGPRs always equals the number of VGPRs.
- MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs;
- }
-
- return std::pair(MaxNumVGPRs, MaxNumAGPRs);
-}
-
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
Reserved.set(AMDGPU::MODE);
@@ -742,7 +683,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// Reserve VGPRs/AGPRs.
//
- auto [MaxNumVGPRs, MaxNumAGPRs] = getMaxNumVectorRegs(MF);
+ auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction());
for (const TargetRegisterClass *RC : regclasses()) {
if (RC->isBaseClass() && isVGPRClass(RC)) {
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 0008e5f..5508f07 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -90,11 +90,6 @@ public:
/// spilling is needed.
MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
- /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
- /// of waves per execution unit required for the function \p MF.
- std::pair<unsigned, unsigned>
- getMaxNumVectorRegs(const MachineFunction &MF) const;
-
BitVector getReservedRegs(const MachineFunction &MF) const override;
bool isAsmClobberable(const MachineFunction &MF,
MCRegister PhysReg) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 0039d2f..218841d 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -109,6 +109,23 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList>
let TSFlags{2} = HasVGPR;
let TSFlags{3} = HasAGPR;
let TSFlags{4} = HasSGPR;
+
+  // RA uses RegisterClass AllocationPriority, amongst other info (e.g. ordering in the basic block),
+  // to decide which registers to try to assign first. This RegisterClass priority is usually weighted
+  // very heavily, if not the most heavily, when choosing which VirtReg to allocate next.
+  //
+  // We have 5 bits to assign AllocationPriorities to RegisterClasses. Generally, it is beneficial to
+  // assign more constrained RegisterClasses first. As a result, we prioritize register classes with
+  // more 32-bit tuples (e.g. VReg_512) over register classes with fewer tuples (e.g. VGPR_32).
+  //
+  // The interesting case is the vector registers on architectures which have ARegs, VRegs and AVRegs.
+  // In this case, we would like to assign ARegs and VRegs before AVRegs, as AVRegs are less constrained
+  // and can be assigned to both AGPRs and VGPRs. We use the 5th bit to encode this into the
+  // RegisterClass AllocationPriority: BaseClassPriority turns the bit on, and BaseClassScaleFactor
+  // scales it (i.e. 1 << 4).
+ field int BaseClassPriority = 1;
+ field int BaseClassScaleFactor = 16;
+
}
multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1,
@@ -575,7 +592,7 @@ let HasVGPR = 1 in {
def VGPR_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
(add (interleave (sequence "VGPR%u_LO16", 0, 255),
(sequence "VGPR%u_HI16", 0, 255)))> {
- let AllocationPriority = 2;
+ let AllocationPriority = !add(2, !mul(BaseClassPriority, BaseClassScaleFactor));
let Size = 16;
let GeneratePressureSet = 0;
@@ -601,7 +618,7 @@ def VGPR_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
// i16/f16 only on VI+
def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
(add (sequence "VGPR%u", 0, 255))> {
- let AllocationPriority = 0;
+ let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
let Size = 32;
let Weight = 1;
let BaseClassOrder = 32;
@@ -610,7 +627,7 @@ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types
// Identical to VGPR_32 except it only contains the low 128 (Lo128) registers.
def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
(add (sequence "VGPR%u", 0, 127))> {
- let AllocationPriority = 0;
+ let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
let GeneratePressureSet = 0;
let Size = 32;
let Weight = 1;
@@ -668,7 +685,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
// AccVGPR 32-bit registers
def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
(add (sequence "AGPR%u", 0, 255))> {
- let AllocationPriority = 0;
+ let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor));
let Size = 32;
let Weight = 1;
let BaseClassOrder = 32;
@@ -940,14 +957,23 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> :
// Requires n v_mov_b32 to copy
let CopyCost = numRegs;
- let AllocationPriority = !sub(numRegs, 1);
+
+  // Since we only have 5 bits for the RegisterClass AllocationPriority, and since we use the
+  // 5th bit for BaseClassPriority, we need to encode the SizePriority into 4 bits. As a result
+  // of this encoding, registers with numRegs 15 or 16 get a SizePriority of 14, and
+  // registers with numRegs 17+ get a SizePriority of 15. In practice, there is only one
+  // RegClass per vector register type in each of these groups (i.e. numRegs = 15,16 : {VReg_512},
+  // and numRegs = 17+ : {VReg_1024}). Therefore, we have not lost any info by compressing.
+  defvar SizePriority = !if(!le(numRegs, 14), !sub(numRegs, 1), !if(!le(numRegs, 16), 14, 15));
+
+  let AllocationPriority = !add(SizePriority, !mul(BaseClassPriority, BaseClassScaleFactor));
let Weight = numRegs;
}
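Putting the two pieces together, the resulting 5-bit priorities (worked from the definitions above) are: VGPR_32 and AGPR_32 get 0 + 1 * 16 = 16, VGPR_16 gets 2 + 16 = 18, VReg_512 (numRegs = 16) gets 14 + 16 = 30, and VReg_1024 (numRegs = 32) gets 15 + 16 = 31, while the AV classes, built with BaseClassPriority = 0, keep their plain size priority (e.g. 14 for AVReg_512, 15 for AVReg_1024). Every AGPR- or VGPR-only class therefore outranks any AV class, as intended.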
// Define a register tuple class, along with one requiring an even
// aligned base register.
multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
- let HasVGPR = 1 in {
+ let HasVGPR = 1, BaseClassPriority = 1 in {
// Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, regList> {
let BaseClassOrder = !mul(numRegs, 32);
@@ -981,7 +1007,7 @@ defm VReg_1024 : VRegClass<32, Reg1024Types.types, (add VGPR_1024)>;
}
multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
- let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1 in {
+ let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1, BaseClassPriority = 1 in {
// Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, regList> {
let BaseClassOrder = !mul(numRegs, 32);
@@ -1066,6 +1092,7 @@ def VS_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64, SReg_6
def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_32)> {
let HasVGPR = 1;
let HasAGPR = 1;
+ let BaseClassPriority = 0;
let Size = 32;
}
} // End GeneratePressureSet = 0
@@ -1074,7 +1101,7 @@ def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_3
// aligned base register.
multiclass AVRegClass<int numRegs, list<ValueType> regTypes,
dag vregList, dag aregList> {
- let HasVGPR = 1, HasAGPR = 1 in {
+ let HasVGPR = 1, HasAGPR = 1, BaseClassPriority = 0 in {
// Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, (add vregList, aregList)>;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index b5b3cc9..83e63ac 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -732,7 +732,14 @@ bool isGenericAtomic(unsigned Opc) {
}
bool isAsyncStore(unsigned Opc) {
- return false; // placeholder before async store implementation.
+ return Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B8_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B32_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B64_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B128_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B8_SADDR_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B32_SADDR_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B64_SADDR_gfx1250 ||
+ Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B128_SADDR_gfx1250;
}
bool isTensorStore(unsigned Opc) {
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index c812dc9..95fcd4a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1236,6 +1236,12 @@ let isCommutable = 1, isReMaterializable = 1 in {
defm V_PK_MIN_NUM_BF16 : VOP3PInst<"v_pk_min_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fminnum_like>;
defm V_PK_MAX_NUM_BF16 : VOP3PInst<"v_pk_max_num_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, fmaxnum_like>;
defm V_PK_FMA_BF16 : VOP3PInst<"v_pk_fma_bf16", VOP3P_Profile<VOP_V2BF16_V2BF16_V2BF16_V2BF16, VOP3_PACKED>, any_fma>;
+
+ // Scalar pseudo used to emulate AMDGPUClamp.
+ // Expanded to V_PK_MAX_NUM_BF16 with unused high half.
+ // FIXME-TRUE16: Pseudo expansion of this won't work with True16.
+ let True16Predicate = UseFakeTrue16Insts in
+ defm V_MAX_BF16_PSEUDO : VOP3Inst <"v_max_bf16", VOP_BF16_BF16_BF16>;
}
} // End isCommutable = 1, isReMaterializable = 1
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 50217c3..9e4dbec 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -4261,8 +4261,7 @@ std::optional<unsigned> ARMBaseInstrInfo::getOperandLatencyImpl(
// instructions).
if (Latency > 0 && Subtarget.isThumb2()) {
const MachineFunction *MF = DefMI.getParent()->getParent();
- // FIXME: Use Function::hasOptSize().
- if (MF->getFunction().hasFnAttribute(Attribute::OptimizeForSize))
+ if (MF->getFunction().hasOptSize())
--Latency;
}
return Latency;
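For reference, Function::hasOptSize() subsumes the old attribute check and also honors minsize; its definition in llvm/IR/Function.h is approximately:

    bool hasOptSize() const {
      return hasFnAttribute(Attribute::OptimizeForSize) || hasMinSize();
    }

so the FIXME is resolved and minsize functions now get the same latency adjustment.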
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index fca5dff..066b392 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -370,6 +370,11 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::FMINNUM, VT, Legal);
setOperationAction(ISD::FMAXNUM, VT, Legal);
setOperationAction(ISD::FROUND, VT, Legal);
+ setOperationAction(ISD::FROUNDEVEN, VT, Legal);
+ setOperationAction(ISD::FRINT, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
@@ -1507,6 +1512,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
setOperationAction(ISD::FLOG2, MVT::f16, Promote);
setOperationAction(ISD::FROUND, MVT::f16, Legal);
+ setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::f16, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal);
+ setOperationAction(ISD::FRINT, MVT::f16, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::f16, Legal);
+ setOperationAction(ISD::FCEIL, MVT::f16, Legal);
}
if (Subtarget->hasNEON()) {
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index ec6f4e2..ece6c10 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -12327,7 +12327,7 @@ bool ARMAsmParser::parseDirectiveEven(SMLoc L) {
}
assert(Section && "must have section to emit alignment");
- if (Section->useCodeAlign())
+ if (getContext().getAsmInfo()->useCodeAlign(*Section))
getStreamer().emitCodeAlignment(Align(2), &getSTI());
else
getStreamer().emitValueToAlignment(Align(2));
@@ -12525,7 +12525,7 @@ bool ARMAsmParser::parseDirectiveAlign(SMLoc L) {
// '.align' is target specifically handled to mean 2**2 byte alignment.
const MCSection *Section = getStreamer().getCurrentSectionOnly();
assert(Section && "must have section to emit alignment");
- if (Section->useCodeAlign())
+ if (getContext().getAsmInfo()->useCodeAlign(*Section))
getStreamer().emitCodeAlignment(Align(4), &getSTI(), 0);
else
getStreamer().emitValueToAlignment(Align(4), 0, 1, 0);
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index a7a9911..6dfe846 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -708,8 +708,6 @@ private:
void SwitchToExTabSection(const MCSymbol &FnStart);
void SwitchToExIdxSection(const MCSymbol &FnStart);
- void EmitFixup(const MCExpr *Expr, MCFixupKind Kind);
-
bool IsThumb;
bool IsAndroid;
@@ -1096,8 +1094,8 @@ void ARMTargetELFStreamer::emitLabel(MCSymbol *Symbol) {
}
void ARMTargetELFStreamer::annotateTLSDescriptorSequence(
- const MCSymbolRefExpr *S) {
- getStreamer().EmitFixup(S, FK_Data_4);
+ const MCSymbolRefExpr *Expr) {
+ getStreamer().addFixup(Expr, FK_Data_4);
}
void ARMTargetELFStreamer::emitCode16() { getStreamer().setIsThumb(true); }
@@ -1140,7 +1138,8 @@ void ARMTargetELFStreamer::finish() {
MCContext &Ctx = getContext();
auto &Asm = getStreamer().getAssembler();
if (any_of(Asm, [](const MCSection &Sec) {
- return cast<MCSectionELF>(Sec).getFlags() & ELF::SHF_ARM_PURECODE;
+ return static_cast<const MCSectionELF &>(Sec).getFlags() &
+ ELF::SHF_ARM_PURECODE;
})) {
auto *Text =
static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection());
@@ -1206,11 +1205,6 @@ inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) {
SectionKind::getData(), FnStart);
}
-void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) {
- MCFragment *Frag = getCurrentFragment();
- Frag->addFixup(MCFixup::create(Frag->getContents().size(), Expr, Kind));
-}
-
void ARMELFStreamer::EHReset() {
ExTab = nullptr;
FnStart = nullptr;
@@ -1290,14 +1284,11 @@ void ARMELFStreamer::emitCantUnwind() { CantUnwind = true; }
// Add the R_ARM_NONE fixup at the same position
void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) {
const MCSymbol *PersonalitySym = getContext().getOrCreateSymbol(Name);
+ visitUsedSymbol(*PersonalitySym);
const MCSymbolRefExpr *PersonalityRef =
MCSymbolRefExpr::create(PersonalitySym, ARM::S_ARM_NONE, getContext());
-
- visitUsedExpr(*PersonalityRef);
- MCFragment *DF = getCurrentFragment();
- DF->addFixup(
- MCFixup::create(DF->getContents().size(), PersonalityRef, FK_Data_4));
+ addFixup(PersonalityRef, FK_Data_4);
}
void ARMELFStreamer::FlushPendingOffset() {
diff --git a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
index ad8aa571..0fb33cd 100644
--- a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
+++ b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
@@ -260,7 +260,7 @@ bool AVRAsmPrinter::doFinalization(Module &M) {
continue;
}
- auto *Section = cast<MCSectionELF>(TLOF.SectionForGlobal(&GO, TM));
+ auto *Section = static_cast<MCSectionELF *>(TLOF.SectionForGlobal(&GO, TM));
if (Section->getName().starts_with(".data"))
NeedsCopyData = true;
else if (Section->getName().starts_with(".rodata") && SubTM->hasLPM())
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
index c2c1bb4..0615ec9 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
@@ -24,8 +24,6 @@ AVRMCAsmInfo::AVRMCAsmInfo(const Triple &TT, const MCTargetOptions &Options) {
CalleeSaveStackSlotSize = 2;
CommentString = ";";
SeparatorString = "$";
- PrivateGlobalPrefix = ".L";
- PrivateLabelPrefix = ".L";
UsesELFSectionDirectiveForBSS = true;
SupportsDebugInformation = true;
}
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
index fab2713..1915fa8 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
@@ -14,7 +14,7 @@
#define LLVM_AVR_ASM_INFO_H
#include "MCTargetDesc/AVRMCExpr.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
#include "llvm/MC/MCExpr.h"
namespace llvm {
@@ -22,7 +22,7 @@ namespace llvm {
class Triple;
/// Specifies the format of AVR assembly files.
-class AVRMCAsmInfo : public MCAsmInfo {
+class AVRMCAsmInfo : public MCAsmInfoELF {
public:
explicit AVRMCAsmInfo(const Triple &TT, const MCTargetOptions &Options);
void printSpecifierExpr(raw_ostream &OS,
diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index 1e29a0f..bed6bc9 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -957,47 +957,47 @@ void BTFDebug::visitMapDefType(const DIType *Ty, uint32_t &TypeId) {
return;
}
- // MapDef type may be a struct type or a non-pointer derived type
- const DIType *OrigTy = Ty;
- while (auto *DTy = dyn_cast<DIDerivedType>(Ty)) {
- auto Tag = DTy->getTag();
- if (Tag != dwarf::DW_TAG_typedef && Tag != dwarf::DW_TAG_const_type &&
- Tag != dwarf::DW_TAG_volatile_type &&
- Tag != dwarf::DW_TAG_restrict_type)
- break;
- Ty = DTy->getBaseType();
- }
-
- const auto *CTy = dyn_cast<DICompositeType>(Ty);
- if (!CTy)
- return;
-
- auto Tag = CTy->getTag();
- if (Tag != dwarf::DW_TAG_structure_type || CTy->isForwardDecl())
- return;
-
- // Visit all struct members to ensure their types are visited.
- const DINodeArray Elements = CTy->getElements();
- for (const auto *Element : Elements) {
- const auto *MemberType = cast<DIDerivedType>(Element);
- const DIType *MemberBaseType = MemberType->getBaseType();
-
- // If the member is a composite type, that may indicate the currently
- // visited composite type is a wrapper, and the member represents the
- // actual map definition.
- // In that case, visit the member with `visitMapDefType` instead of
- // `visitTypeEntry`, treating it specifically as a map definition rather
- // than as a regular composite type.
- const auto *MemberCTy = dyn_cast<DICompositeType>(MemberBaseType);
- if (MemberCTy) {
- visitMapDefType(MemberBaseType, TypeId);
- } else {
- visitTypeEntry(MemberBaseType);
+ uint32_t TmpId;
+ switch (Ty->getTag()) {
+ case dwarf::DW_TAG_typedef:
+ case dwarf::DW_TAG_const_type:
+ case dwarf::DW_TAG_volatile_type:
+ case dwarf::DW_TAG_restrict_type:
+ case dwarf::DW_TAG_pointer_type:
+ visitMapDefType(dyn_cast<DIDerivedType>(Ty)->getBaseType(), TmpId);
+ break;
+ case dwarf::DW_TAG_array_type:
+ // Visit nested map array and jump to the element type
+ visitMapDefType(dyn_cast<DICompositeType>(Ty)->getBaseType(), TmpId);
+ break;
+ case dwarf::DW_TAG_structure_type: {
+ // Visit all struct members to ensure their types are visited.
+ const auto *CTy = cast<DICompositeType>(Ty);
+ const DINodeArray Elements = CTy->getElements();
+ for (const auto *Element : Elements) {
+ const auto *MemberType = cast<DIDerivedType>(Element);
+ const DIType *MemberBaseType = MemberType->getBaseType();
+ // If the member is a composite type, that may indicate the currently
+ // visited composite type is a wrapper, and the member represents the
+ // actual map definition.
+ // In that case, visit the member with `visitMapDefType` instead of
+ // `visitTypeEntry`, treating it specifically as a map definition rather
+ // than as a regular composite type.
+ const auto *MemberCTy = dyn_cast<DICompositeType>(MemberBaseType);
+ if (MemberCTy) {
+ visitMapDefType(MemberBaseType, TmpId);
+ } else {
+ visitTypeEntry(MemberBaseType);
+ }
}
+ break;
+ }
+ default:
+ break;
}
// Visit this type, struct or a const/typedef/volatile/restrict type
- visitTypeEntry(OrigTy, TypeId, false, false);
+ visitTypeEntry(Ty, TypeId, false, false);
}
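The new tag walk matters for map-in-map definitions, where the inner map type sits behind array and pointer derived types; a minimal sketch assuming libbpf's map-in-map conventions:

    struct inner_map {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __uint(max_entries, 1);
        __type(key, int);
        __type(value, int);
    };

    struct {
        __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
        __uint(max_entries, 4);
        __type(key, int);
        __array(values, struct inner_map);  /* DWARF: array -> pointer -> struct */
    } outer SEC(".maps");

Visiting outer now recurses through the values member's array and pointer tags down to the inner struct, instead of stopping at the first non-struct derived type as the old loop did.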
/// Read file contents from the actual file or from the source
@@ -1255,10 +1255,8 @@ void BTFDebug::beginFunctionImpl(const MachineFunction *MF) {
FuncInfo.Label = FuncLabel;
FuncInfo.TypeId = FuncTypeId;
if (FuncLabel->isInSection()) {
- MCSection &Section = FuncLabel->getSection();
- const MCSectionELF *SectionELF = dyn_cast<MCSectionELF>(&Section);
- assert(SectionELF && "Null section for Function Label");
- SecNameOff = addString(SectionELF->getName());
+ auto &Sec = static_cast<const MCSectionELF &>(FuncLabel->getSection());
+ SecNameOff = addString(Sec.getName());
} else {
SecNameOff = addString(".text");
}
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
index 827e928..bb74f6a 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
@@ -54,11 +54,8 @@ unsigned BPFELFObjectWriter::getRelocType(const MCFixup &Fixup,
const MCSymbol &Sym = *A;
if (Sym.isDefined()) {
- MCSection &Section = Sym.getSection();
- const MCSectionELF *SectionELF = dyn_cast<MCSectionELF>(&Section);
- assert(SectionELF && "Null section for reloc symbol");
-
- unsigned Flags = SectionELF->getFlags();
+ auto &Section = static_cast<const MCSectionELF &>(Sym.getSection());
+ unsigned Flags = Section.getFlags();
if (Sym.isTemporary()) {
// .BTF.ext generates FK_Data_4 relocations for
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
index 7b21684..63d6e6f 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
@@ -13,18 +13,19 @@
#ifndef LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCASMINFO_H
#define LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCASMINFO_H
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
#include "llvm/TargetParser/Triple.h"
namespace llvm {
-class BPFMCAsmInfo : public MCAsmInfo {
+class BPFMCAsmInfo : public MCAsmInfoELF {
public:
explicit BPFMCAsmInfo(const Triple &TT, const MCTargetOptions &Options) {
if (TT.getArch() == Triple::bpfeb)
IsLittleEndian = false;
PrivateGlobalPrefix = ".L";
+ PrivateLabelPrefix = "L";
WeakRefDirective = "\t.weak\t";
UsesELFSectionDirectiveForBSS = true;
diff --git a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
index c86fa2b..54c3cea 100644
--- a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -457,7 +457,7 @@ bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) {
TII = ST->getInstrInfo();
const Function &F = MF.getFunction();
- bool OptForSize = F.hasFnAttribute(Attribute::OptimizeForSize);
+ bool OptForSize = F.hasOptSize();
// Combine aggressively (for code size)
ShouldCombineAggressively =
diff --git a/llvm/lib/Target/Hexagon/HexagonMask.cpp b/llvm/lib/Target/Hexagon/HexagonMask.cpp
index 6eccf80..9d7776d 100644
--- a/llvm/lib/Target/Hexagon/HexagonMask.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonMask.cpp
@@ -76,7 +76,7 @@ bool HexagonMask::runOnMachineFunction(MachineFunction &MF) {
HII = HST.getInstrInfo();
const Function &F = MF.getFunction();
- if (!F.hasFnAttribute(Attribute::OptimizeForSize))
+ if (!F.hasOptSize())
return false;
// Mask instruction is available only from v66
if (!HST.hasV66Ops())
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index e915a3c4..d96136c 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2385,23 +2385,6 @@ SDValue LoongArchTargetLowering::lowerBF16_TO_FP(SDValue Op,
return Res;
}
-static bool isConstantOrUndef(const SDValue Op) {
- if (Op->isUndef())
- return true;
- if (isa<ConstantSDNode>(Op))
- return true;
- if (isa<ConstantFPSDNode>(Op))
- return true;
- return false;
-}
-
-static bool isConstantOrUndefBUILD_VECTOR(const BuildVectorSDNode *Op) {
- for (unsigned i = 0; i < Op->getNumOperands(); ++i)
- if (isConstantOrUndef(Op->getOperand(i)))
- return true;
- return false;
-}
-
// Lower BUILD_VECTOR as broadcast load (if possible).
// For example:
// %a = load i8, ptr %ptr
@@ -2451,10 +2434,14 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
BuildVectorSDNode *Node = cast<BuildVectorSDNode>(Op);
EVT ResTy = Op->getValueType(0);
+ unsigned NumElts = ResTy.getVectorNumElements();
SDLoc DL(Op);
APInt SplatValue, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
+ bool IsConstant = false;
+ bool UseSameConstant = true;
+ SDValue ConstantValue;
bool Is128Vec = ResTy.is128BitVector();
bool Is256Vec = ResTy.is256BitVector();
@@ -2505,20 +2492,45 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
if (DAG.isSplatValue(Op, /*AllowUndefs=*/false))
return Op;
- if (!isConstantOrUndefBUILD_VECTOR(Node)) {
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Opi = Node->getOperand(i);
+ if (isIntOrFPConstant(Opi)) {
+ IsConstant = true;
+ if (!ConstantValue.getNode())
+ ConstantValue = Opi;
+ else if (ConstantValue != Opi)
+ UseSameConstant = false;
+ }
+ }
+
+  // If the type of the BUILD_VECTOR is v2f64, custom legalizing it has no benefit.
+ if (IsConstant && UseSameConstant && ResTy != MVT::v2f64) {
+ SDValue Result = DAG.getSplatBuildVector(ResTy, DL, ConstantValue);
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Opi = Node->getOperand(i);
+ if (!isIntOrFPConstant(Opi))
+ Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Result, Opi,
+ DAG.getConstant(i, DL, Subtarget.getGRLenVT()));
+ }
+ return Result;
+ }
+
+ if (!IsConstant) {
// Use INSERT_VECTOR_ELT operations rather than expand to stores.
// The resulting code is the same length as the expansion, but it doesn't
// use memory operations.
- EVT ResTy = Node->getValueType(0);
-
assert(ResTy.isVector());
- unsigned NumElts = ResTy.getVectorNumElements();
- SDValue Vector =
- DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ResTy, Node->getOperand(0));
+ SDValue Op0 = Node->getOperand(0);
+ SDValue Vector = DAG.getUNDEF(ResTy);
+
+ if (!Op0.isUndef())
+ Vector = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ResTy, Op0);
for (unsigned i = 1; i < NumElts; ++i) {
- Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Vector,
- Node->getOperand(i),
+ SDValue Opi = Node->getOperand(i);
+ if (Opi.isUndef())
+ continue;
+ Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Vector, Opi,
DAG.getConstant(i, DL, Subtarget.getGRLenVT()));
}
return Vector;
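A sketch of the new shared-constant path, assuming a v4i32 BUILD_VECTOR whose operands are one repeated constant C plus a single variable x:

    t0: v4i32 = BUILD_VECTOR C, C, x, C
    ; lowers to a splat plus one insert:
    t1: v4i32 = splat_build_vector C
    t2: v4i32 = INSERT_VECTOR_ELT t1, x, 2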
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
index 8fa72bc..d9ea88c 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
@@ -254,6 +254,7 @@ bool LoongArchAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) {
MCFixup Fixup =
MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN);
F.setVarFixups({Fixup});
+ F.setLinkerRelaxable();
F.getParent()->setLinkerRelaxable();
return true;
}
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index feb4eb3..7a8395a 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -969,7 +969,7 @@ void MipsTargetELFStreamer::finish() {
Align Alignment = Section.getAlign();
S.switchSection(&Section);
- if (Section.useCodeAlign())
+ if (getContext().getAsmInfo()->useCodeAlign(Section))
S.emitCodeAlignment(Alignment, &STI, Alignment.value());
else
S.emitValueToAlignment(Alignment, 0, 1, Alignment.value());
@@ -1034,12 +1034,14 @@ MCELFStreamer &MipsTargetELFStreamer::getStreamer() {
void MipsTargetELFStreamer::emitGPRel32Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(4);
S.addFixup(Value, Mips::fixup_Mips_GPREL32);
S.appendContents(4, 0);
}
void MipsTargetELFStreamer::emitGPRel64Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(8);
  // fixup_Mips_GPREL32 designates R_MIPS_GPREL32+R_MIPS_64 on MIPS64.
S.addFixup(Value, Mips::fixup_Mips_GPREL32);
S.appendContents(8, 0);
@@ -1047,24 +1049,28 @@ void MipsTargetELFStreamer::emitGPRel64Value(const MCExpr *Value) {
void MipsTargetELFStreamer::emitDTPRel32Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(4);
S.addFixup(Value, Mips::fixup_Mips_DTPREL32);
S.appendContents(4, 0);
}
void MipsTargetELFStreamer::emitDTPRel64Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(8);
S.addFixup(Value, Mips::fixup_Mips_DTPREL64);
S.appendContents(8, 0);
}
void MipsTargetELFStreamer::emitTPRel32Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(4);
S.addFixup(Value, Mips::fixup_Mips_TPREL32);
S.appendContents(4, 0);
}
void MipsTargetELFStreamer::emitTPRel64Value(const MCExpr *Value) {
auto &S = getStreamer();
+ S.ensureHeadroom(8);
S.addFixup(Value, Mips::fixup_Mips_TPREL64);
S.appendContents(8, 0);
}
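The ensureHeadroom calls keep the fixup recorded by addFixup and the placeholder bytes written by appendContents in the same fragment; a sketch of the assumed pattern:

    S.ensureHeadroom(4);       // reserve space up front, in the current fragment
    S.addFixup(Value, Kind);   // fixup offset = current fragment size
    S.appendContents(4, 0);    // placeholder bytes; must not spill into a new fragment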
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index ca03310..a2e48ab 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -737,14 +737,18 @@ void MipsAsmPrinter::emitStartOfAsmFile(Module &M) {
if (FS.empty() && M.size() && F->hasFnAttribute("target-features"))
FS = F->getFnAttribute("target-features").getValueAsString();
+ std::string strFS = FS.str();
+ if (M.size() && F->getFnAttribute("use-soft-float").getValueAsBool())
+ strFS += strFS.empty() ? "+soft-float" : ",+soft-float";
+
// Compute MIPS architecture attributes based on the default subtarget
// that we'd have constructed.
// FIXME: For ifunc related functions we could iterate over and look
// for a feature string that doesn't match the default one.
StringRef CPU = MIPS_MC::selectMipsCPU(TT, TM.getTargetCPU());
const MipsTargetMachine &MTM = static_cast<const MipsTargetMachine &>(TM);
- const MipsSubtarget STI(TT, CPU, FS, MTM.isLittleEndian(), MTM,
- std::nullopt);
+ const MipsSubtarget STI(TT, CPU, StringRef(strFS), MTM.isLittleEndian(),
+ MTM, std::nullopt);
bool IsABICalls = STI.isABICalls();
const MipsABIInfo &ABI = MTM.getABI();
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 0e581a7..ec6b382 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -522,9 +522,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
setOperationAction(ISD::TRAP, MVT::Other, Legal);
- setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
- setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
-
setTargetDAGCombine({ISD::SDIVREM, ISD::UDIVREM, ISD::SELECT, ISD::AND,
ISD::OR, ISD::ADD, ISD::SUB, ISD::AssertZext, ISD::SHL,
ISD::SIGN_EXTEND});
@@ -1360,8 +1357,6 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const
case ISD::FP_TO_SINT: return lowerFP_TO_SINT(Op, DAG);
case ISD::READCYCLECOUNTER:
return lowerREADCYCLECOUNTER(Op, DAG);
- case ISD::ConstantFP:
- return lowerConstantFP(Op, DAG);
}
return SDValue();
}
@@ -3019,30 +3014,6 @@ SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op,
return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op.getValueType(), Trunc);
}
-SDValue MipsTargetLowering::lowerConstantFP(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT VT = Op.getSimpleValueType();
- SDNode *N = Op.getNode();
- ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(N);
-
- if (!CFP->isNaN() || Subtarget.isNaN2008()) {
- return SDValue();
- }
-
- APFloat NaNValue = CFP->getValueAPF();
- auto &Sem = NaNValue.getSemantics();
-
- // The MSB of the mantissa should be zero for QNaNs in the MIPS legacy NaN
- // encodings, and one for sNaNs. Check every NaN constants and make sure
- // they are correctly encoded for legacy encodings.
- if (!NaNValue.isSignaling()) {
- APFloat RealQNaN = NaNValue.getSNaN(Sem);
- return DAG.getConstantFP(RealQNaN, DL, VT);
- }
- return SDValue();
-}
-
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.h b/llvm/lib/Target/Mips/MipsISelLowering.h
index 31ac5d4..c65c76c 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.h
+++ b/llvm/lib/Target/Mips/MipsISelLowering.h
@@ -592,7 +592,6 @@ class TargetRegisterClass;
SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerConstantFP(SDValue Op, SelectionDAG &DAG) const;
/// isEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization.
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
index 614b321..ce9cd12 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -15,8 +15,6 @@
using namespace llvm;
-void NVPTXMCAsmInfo::anchor() {}
-
NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple,
const MCTargetOptions &Options) {
if (TheTriple.getArch() == Triple::nvptx64) {
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
index 77c4dae..f071406 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
@@ -19,8 +19,6 @@ namespace llvm {
class Triple;
class NVPTXMCAsmInfo : public MCAsmInfo {
- virtual void anchor();
-
public:
explicit NVPTXMCAsmInfo(const Triple &TheTriple,
const MCTargetOptions &Options);
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
index 9f91143..329e3b5 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
@@ -97,10 +97,7 @@ void NVPTXTargetStreamer::changeSection(const MCSection *CurSection,
if (isDwarfSection(FI, Section)) {
// Emit DWARF .file directives in the outermost scope.
outputDwarfFileDirectives();
- OS << "\t.section";
- Section->printSwitchToSection(*getStreamer().getContext().getAsmInfo(),
- getStreamer().getContext().getTargetTriple(),
- OS, SubSection);
+ OS << "\t.section\t" << Section->getName() << '\n';
// DWARF sections are enclosed into braces - emit the open one.
OS << "\t{\n";
HasSections = true;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 65e7c56..96f52275 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -145,18 +145,6 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
if (tryStoreVector(N))
return;
break;
- case NVPTXISD::LoadParam:
- case NVPTXISD::LoadParamV2:
- case NVPTXISD::LoadParamV4:
- if (tryLoadParam(N))
- return;
- break;
- case NVPTXISD::StoreParam:
- case NVPTXISD::StoreParamV2:
- case NVPTXISD::StoreParamV4:
- if (tryStoreParam(N))
- return;
- break;
case ISD::INTRINSIC_W_CHAIN:
if (tryIntrinsicChain(N))
return;
@@ -1462,267 +1450,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
return true;
}
-bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
- SDValue Chain = Node->getOperand(0);
- SDValue Offset = Node->getOperand(2);
- SDValue Glue = Node->getOperand(3);
- SDLoc DL(Node);
- MemSDNode *Mem = cast<MemSDNode>(Node);
-
- unsigned VecSize;
- switch (Node->getOpcode()) {
- default:
- return false;
- case NVPTXISD::LoadParam:
- VecSize = 1;
- break;
- case NVPTXISD::LoadParamV2:
- VecSize = 2;
- break;
- case NVPTXISD::LoadParamV4:
- VecSize = 4;
- break;
- }
-
- EVT EltVT = Node->getValueType(0);
- EVT MemVT = Mem->getMemoryVT();
-
- std::optional<unsigned> Opcode;
-
- switch (VecSize) {
- default:
- return false;
- case 1:
- Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
- NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
- NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64);
- break;
- case 2:
- Opcode =
- pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
- NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
- NVPTX::LoadParamMemV2I64);
- break;
- case 4:
- Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
- NVPTX::LoadParamMemV4I8, NVPTX::LoadParamMemV4I16,
- NVPTX::LoadParamMemV4I32, {/* no v4i64 */});
- break;
- }
- if (!Opcode)
- return false;
-
- SDVTList VTs;
- if (VecSize == 1) {
- VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
- } else if (VecSize == 2) {
- VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
- } else {
- EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
- VTs = CurDAG->getVTList(EVTs);
- }
-
- unsigned OffsetVal = Offset->getAsZExtVal();
-
- SmallVector<SDValue, 2> Ops(
- {CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue});
-
- ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
- return true;
-}
-
-// Helpers for constructing opcode names (e.g. NVPTX::StoreParamV4F32_iiri)
-#define getOpcV2H(ty, opKind0, opKind1) \
- NVPTX::StoreParamV2##ty##_##opKind0##opKind1
-
-#define getOpcV2H1(ty, opKind0, isImm1) \
- (isImm1) ? getOpcV2H(ty, opKind0, i) : getOpcV2H(ty, opKind0, r)
-
-#define getOpcodeForVectorStParamV2(ty, isimm) \
- (isimm[0]) ? getOpcV2H1(ty, i, isimm[1]) : getOpcV2H1(ty, r, isimm[1])
-
-#define getOpcV4H(ty, opKind0, opKind1, opKind2, opKind3) \
- NVPTX::StoreParamV4##ty##_##opKind0##opKind1##opKind2##opKind3
-
-#define getOpcV4H3(ty, opKind0, opKind1, opKind2, isImm3) \
- (isImm3) ? getOpcV4H(ty, opKind0, opKind1, opKind2, i) \
- : getOpcV4H(ty, opKind0, opKind1, opKind2, r)
-
-#define getOpcV4H2(ty, opKind0, opKind1, isImm2, isImm3) \
- (isImm2) ? getOpcV4H3(ty, opKind0, opKind1, i, isImm3) \
- : getOpcV4H3(ty, opKind0, opKind1, r, isImm3)
-
-#define getOpcV4H1(ty, opKind0, isImm1, isImm2, isImm3) \
- (isImm1) ? getOpcV4H2(ty, opKind0, i, isImm2, isImm3) \
- : getOpcV4H2(ty, opKind0, r, isImm2, isImm3)
-
-#define getOpcodeForVectorStParamV4(ty, isimm) \
- (isimm[0]) ? getOpcV4H1(ty, i, isimm[1], isimm[2], isimm[3]) \
- : getOpcV4H1(ty, r, isimm[1], isimm[2], isimm[3])
-
-#define getOpcodeForVectorStParam(n, ty, isimm) \
- (n == 2) ? getOpcodeForVectorStParamV2(ty, isimm) \
- : getOpcodeForVectorStParamV4(ty, isimm)
-
-static unsigned pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops,
- unsigned NumElts,
- MVT::SimpleValueType MemTy,
- SelectionDAG *CurDAG, SDLoc DL) {
-  // Determine which inputs are registers and which are immediates; for the
-  // immediates, make new operands with constant values.
- SmallVector<bool, 4> IsImm(NumElts, false);
- for (unsigned i = 0; i < NumElts; i++) {
- IsImm[i] = (isa<ConstantSDNode>(Ops[i]) || isa<ConstantFPSDNode>(Ops[i]));
- if (IsImm[i]) {
- SDValue Imm = Ops[i];
- if (MemTy == MVT::f32 || MemTy == MVT::f64) {
- const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
- const ConstantFP *CF = ConstImm->getConstantFPValue();
- Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
- } else {
- const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
- const ConstantInt *CI = ConstImm->getConstantIntValue();
- Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
- }
- Ops[i] = Imm;
- }
- }
-
- // Get opcode for MemTy, size, and register/immediate operand ordering
- switch (MemTy) {
- case MVT::i8:
- return getOpcodeForVectorStParam(NumElts, I8, IsImm);
- case MVT::i16:
- return getOpcodeForVectorStParam(NumElts, I16, IsImm);
- case MVT::i32:
- return getOpcodeForVectorStParam(NumElts, I32, IsImm);
- case MVT::i64:
- assert(NumElts == 2 && "MVT too large for NumElts > 2");
- return getOpcodeForVectorStParamV2(I64, IsImm);
- case MVT::f32:
- return getOpcodeForVectorStParam(NumElts, F32, IsImm);
- case MVT::f64:
- assert(NumElts == 2 && "MVT too large for NumElts > 2");
- return getOpcodeForVectorStParamV2(F64, IsImm);
-
- // These cases don't support immediates, just use the all register version
- // and generate moves.
- case MVT::i1:
- return (NumElts == 2) ? NVPTX::StoreParamV2I8_rr
- : NVPTX::StoreParamV4I8_rrrr;
- case MVT::f16:
- case MVT::bf16:
- return (NumElts == 2) ? NVPTX::StoreParamV2I16_rr
- : NVPTX::StoreParamV4I16_rrrr;
- case MVT::v2f16:
- case MVT::v2bf16:
- case MVT::v2i16:
- case MVT::v4i8:
- return (NumElts == 2) ? NVPTX::StoreParamV2I32_rr
- : NVPTX::StoreParamV4I32_rrrr;
- default:
- llvm_unreachable("Cannot select st.param for unknown MemTy");
- }
-}
-
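The macro ladder above only concatenates an 'r'/'i' suffix per operand. A plain-C++ sketch of the same naming scheme (stParamOpcodeName is a hypothetical helper, not the generated opcode table):

#include <array>
#include <cstdio>
#include <string>

// Builds names like "StoreParamV4F32_iiri": one 'i' (immediate) or 'r'
// (register) per element, in operand order.
static std::string stParamOpcodeName(const std::string &Ty, unsigned NumElts,
                                     const std::array<bool, 4> &IsImm) {
  std::string Name = "StoreParamV" + std::to_string(NumElts) + Ty + "_";
  for (unsigned I = 0; I < NumElts; ++I)
    Name += IsImm[I] ? 'i' : 'r';
  return Name;
}

int main() {
  std::printf("%s\n",
              stParamOpcodeName("F32", 4, {true, true, false, true}).c_str());
  // prints: StoreParamV4F32_iiri
}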
-bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
- SDLoc DL(N);
- SDValue Chain = N->getOperand(0);
- SDValue Param = N->getOperand(1);
- unsigned ParamVal = Param->getAsZExtVal();
- SDValue Offset = N->getOperand(2);
- unsigned OffsetVal = Offset->getAsZExtVal();
- MemSDNode *Mem = cast<MemSDNode>(N);
- SDValue Glue = N->getOperand(N->getNumOperands() - 1);
-
- // How many elements do we have?
- unsigned NumElts;
- switch (N->getOpcode()) {
- default:
- llvm_unreachable("Unexpected opcode");
- case NVPTXISD::StoreParam:
- NumElts = 1;
- break;
- case NVPTXISD::StoreParamV2:
- NumElts = 2;
- break;
- case NVPTXISD::StoreParamV4:
- NumElts = 4;
- break;
- }
-
- // Build vector of operands
- SmallVector<SDValue, 8> Ops;
- for (unsigned i = 0; i < NumElts; ++i)
- Ops.push_back(N->getOperand(i + 3));
- Ops.append({CurDAG->getTargetConstant(ParamVal, DL, MVT::i32),
- CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue});
-
- // Determine target opcode
- // If we have an i1, use an 8-bit store. The lowering code in
- // NVPTXISelLowering will have already emitted an upcast.
- std::optional<unsigned> Opcode;
- switch (NumElts) {
- default:
- llvm_unreachable("Unexpected NumElts");
- case 1: {
- MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
- SDValue Imm = Ops[0];
- if (MemTy != MVT::f16 && MemTy != MVT::bf16 &&
- (isa<ConstantSDNode>(Imm) || isa<ConstantFPSDNode>(Imm))) {
- // Convert immediate to target constant
- if (MemTy == MVT::f32 || MemTy == MVT::f64) {
- const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
- const ConstantFP *CF = ConstImm->getConstantFPValue();
- Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
- } else {
- const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
- const ConstantInt *CI = ConstImm->getConstantIntValue();
- Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
- }
- Ops[0] = Imm;
- // Use immediate version of store param
- Opcode =
- pickOpcodeForVT(MemTy, NVPTX::StoreParamI8_i, NVPTX::StoreParamI16_i,
- NVPTX::StoreParamI32_i, NVPTX::StoreParamI64_i);
- } else
- Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
- NVPTX::StoreParamI8_r, NVPTX::StoreParamI16_r,
- NVPTX::StoreParamI32_r, NVPTX::StoreParamI64_r);
- if (Opcode == NVPTX::StoreParamI8_r) {
-      // Fine-tune the opcode depending on the size of the operand.
- // This helps to avoid creating redundant COPY instructions in
- // InstrEmitter::AddRegisterOperand().
- switch (Ops[0].getSimpleValueType().SimpleTy) {
- default:
- break;
- case MVT::i32:
- Opcode = NVPTX::StoreParamI8TruncI32_r;
- break;
- case MVT::i64:
- Opcode = NVPTX::StoreParamI8TruncI64_r;
- break;
- }
- }
- break;
- }
- case 2:
- case 4: {
- MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
- Opcode = pickOpcodeForVectorStParam(Ops, NumElts, MemTy, CurDAG, DL);
- break;
- }
- }
-
- SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
- SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
- MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
- CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
-
- ReplaceNode(N, Ret);
- return true;
-}
-
/// SelectBFE - Look for instruction sequences that can be made more efficient
/// by using the 'bfe' (bit-field extract) PTX instruction
bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index b99b4ef..e504a8f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -78,8 +78,6 @@ private:
bool tryLDG(MemSDNode *N);
bool tryStore(SDNode *N);
bool tryStoreVector(SDNode *N);
- bool tryLoadParam(SDNode *N);
- bool tryStoreParam(SDNode *N);
bool tryFence(SDNode *N);
void SelectAddrSpaceCast(SDNode *N);
bool tryBFE(SDNode *N);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index ddcecc00..f79b862 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1075,12 +1075,6 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(NVPTXISD::DeclareArrayParam)
MAKE_CASE(NVPTXISD::DeclareScalarParam)
MAKE_CASE(NVPTXISD::CALL)
- MAKE_CASE(NVPTXISD::LoadParam)
- MAKE_CASE(NVPTXISD::LoadParamV2)
- MAKE_CASE(NVPTXISD::LoadParamV4)
- MAKE_CASE(NVPTXISD::StoreParam)
- MAKE_CASE(NVPTXISD::StoreParamV2)
- MAKE_CASE(NVPTXISD::StoreParamV4)
MAKE_CASE(NVPTXISD::MoveParam)
MAKE_CASE(NVPTXISD::UNPACK_VECTOR)
MAKE_CASE(NVPTXISD::BUILD_VECTOR)
@@ -1318,105 +1312,6 @@ Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
return DL.getABITypeAlign(Ty);
}
-static bool adjustElementType(EVT &ElementType) {
- switch (ElementType.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::f16:
- case MVT::bf16:
- ElementType = MVT::i16;
- return true;
- case MVT::f32:
- case MVT::v2f16:
- case MVT::v2bf16:
- ElementType = MVT::i32;
- return true;
- case MVT::f64:
- ElementType = MVT::i64;
- return true;
- }
-}
-
-// Use byte-store when the param address of the argument value is unaligned.
-// This may happen when the argument is a field of a packed structure.
-//
-// This is called in LowerCall() when passing the param values.
-static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain,
- uint64_t Offset, EVT ElementType,
- SDValue StVal, SDValue &InGlue,
- unsigned ArgID, const SDLoc &dl) {
- // Bit logic only works on integer types
- if (adjustElementType(ElementType))
- StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal);
-
- // Store each byte
- SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
- // Shift the byte to the last byte position
- SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal,
- DAG.getConstant(i * 8, dl, MVT::i32));
- SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32),
- DAG.getConstant(Offset + i, dl, MVT::i32),
- ShiftVal, InGlue};
-    // Truncating-store only the last byte by using st.param.b8;
-    // the register type can be larger than b8.
- Chain = DAG.getMemIntrinsicNode(
- NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8,
- MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
- InGlue = Chain.getValue(1);
- }
- return Chain;
-}
-
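The core idea of the removed helper -- shift byte i into the low position and emit a one-byte truncating store -- can be modeled in isolation. A minimal sketch, assuming a little-endian layout as PTX uses (names hypothetical):

#include <cstdint>

// Store Val one byte at a time, as the removed LowerUnalignedStoreParam did
// with NVPTXISD::StoreParam nodes of MVT::i8: shift byte I down, then
// truncating-store the low byte at offset I.
static void storeBytewise(uint8_t *Dst, uint64_t Val, unsigned NumBytes) {
  for (unsigned I = 0; I < NumBytes; ++I)
    Dst[I] = static_cast<uint8_t>(Val >> (I * 8));
}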
-// Use byte-load when the param address of the returned value is unaligned.
-// This may happen when the returned value is a field of a packed structure.
-static SDValue
-LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset,
- EVT ElementType, SDValue &InGlue,
- SmallVectorImpl<SDValue> &TempProxyRegOps,
- const SDLoc &dl) {
- // Bit logic only works on integer types
- EVT MergedType = ElementType;
- adjustElementType(MergedType);
-
-  // Load each byte and construct the whole value; the accumulator starts at 0.
- SDValue RetVal = DAG.getConstant(0, dl, MergedType);
- // LoadParamMemI8 loads into i16 register only
- SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue);
- for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
- SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
- DAG.getConstant(Offset + i, dl, MVT::i32),
- InGlue};
- // This will be selected to LoadParamMemI8
- SDValue LdVal =
- DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands,
- MVT::i8, MachinePointerInfo(), Align(1));
- SDValue TmpLdVal = LdVal.getValue(0);
- Chain = LdVal.getValue(1);
- InGlue = LdVal.getValue(2);
-
- TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl,
- TmpLdVal.getSimpleValueType(), TmpLdVal);
- TempProxyRegOps.push_back(TmpLdVal);
-
- SDValue CMask = DAG.getConstant(255, dl, MergedType);
- SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32);
- // Need to extend the i16 register to the whole width.
- TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal);
-    // Mask off the high bits. Leave only the lower 8 bits.
- // Do this because we are using loadparam.b8.
- TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask);
- // Shift and merge
- TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift);
- RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal);
- }
- if (ElementType != MergedType)
- RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
-
- return RetVal;
-}
-
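Its counterpart reassembles the value from byte loads: extend each byte, mask to 8 bits, shift into place, and OR into an accumulator. A standalone sketch under the same little-endian assumption:

#include <cstdint>

// Rebuild a value from single-byte loads, mirroring the removed
// LowerUnalignedLoadRetParam: extend, mask, shift, and merge.
static uint64_t loadBytewise(const uint8_t *Src, unsigned NumBytes) {
  uint64_t Val = 0;
  for (unsigned I = 0; I < NumBytes; ++I)
    Val |= (static_cast<uint64_t>(Src[I]) & 0xFF) << (I * 8);
  return Val;
}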
static bool shouldConvertToIndirectCall(const CallBase *CB,
const GlobalAddressSDNode *Func) {
if (!Func)
@@ -1483,10 +1378,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SelectionDAG &DAG = CLI.DAG;
SDLoc dl = CLI.DL;
- SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
- SDValue Chain = CLI.Chain;
+ const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Callee = CLI.Callee;
- bool &isTailCall = CLI.IsTailCall;
ArgListTy &Args = CLI.getArgs();
Type *RetTy = CLI.RetTy;
const CallBase *CB = CLI.CB;
@@ -1496,6 +1389,36 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
return DAG.getConstant(I, dl, MVT::i32);
};
+ const unsigned UniqueCallSite = GlobalUniqueCallSite++;
+ const SDValue CallChain = CLI.Chain;
+ const SDValue StartChain =
+ DAG.getCALLSEQ_START(CallChain, UniqueCallSite, 0, dl);
+ SDValue DeclareGlue = StartChain.getValue(1);
+
+ SmallVector<SDValue, 16> CallPrereqs{StartChain};
+
+ const auto MakeDeclareScalarParam = [&](SDValue Symbol, unsigned Size) {
+ // PTX ABI requires integral types to be at least 32 bits in size. FP16 is
+ // loaded/stored using i16, so it's handled here as well.
+ const unsigned SizeBits = promoteScalarArgumentSize(Size * 8);
+ SDValue Declare =
+ DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue},
+ {StartChain, Symbol, GetI32(SizeBits), DeclareGlue});
+ CallPrereqs.push_back(Declare);
+ DeclareGlue = Declare.getValue(1);
+ return Declare;
+ };
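The promotion the comment describes amounts to rounding small scalar sizes up to 32 bits. A hedged restatement of the rule (promoteScalarArgumentSize itself lives elsewhere in the NVPTX backend; this helper name is illustrative):

// Round scalar parameter sizes up to PTX's 32-bit minimum; larger sizes
// pass through unchanged.
static unsigned promoteScalarSizeBits(unsigned SizeBits) {
  return SizeBits < 32 ? 32 : SizeBits;
}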
+
+ const auto MakeDeclareArrayParam = [&](SDValue Symbol, Align Align,
+ unsigned Size) {
+ SDValue Declare = DAG.getNode(
+ NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue},
+ {StartChain, Symbol, GetI32(Align.value()), GetI32(Size), DeclareGlue});
+ CallPrereqs.push_back(Declare);
+ DeclareGlue = Declare.getValue(1);
+ return Declare;
+ };
+
// Variadic arguments.
//
// Normally, for each argument, we declare a param scalar or a param
@@ -1511,15 +1434,17 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
//
  // After all varargs are processed, 'VAOffset' holds the size of the
// vararg byte array.
+ assert((CLI.IsVarArg || CLI.Args.size() == CLI.NumFixedArgs) &&
+ "Non-VarArg function with extra arguments");
- SDValue VADeclareParam; // vararg byte array
const unsigned FirstVAArg = CLI.NumFixedArgs; // position of first variadic
- unsigned VAOffset = 0; // current offset in the param array
+ unsigned VAOffset = 0; // current offset in the param array
- const unsigned UniqueCallSite = GlobalUniqueCallSite++;
- SDValue TempChain = Chain;
- Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
- SDValue InGlue = Chain.getValue(1);
+ const SDValue VADeclareParam =
+ CLI.Args.size() > FirstVAArg
+ ? MakeDeclareArrayParam(getCallParamSymbol(DAG, FirstVAArg, MVT::i32),
+ Align(STI.getMaxRequiredAlignment()), 0)
+ : SDValue();
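The VAOffset bookkeeping described above is a simple align-then-append layout over one flat byte array. A self-contained sketch (hypothetical names; alignments are assumed to be powers of two):

#include <vector>

struct VAArg { unsigned SizeBytes, AlignBytes; };

// Lay out variadic arguments in one param byte array: align each argument to
// its own type alignment, record its offset, and return the final array size.
static unsigned layoutVarargs(const std::vector<VAArg> &Args,
                              std::vector<unsigned> &Offsets) {
  unsigned VAOffset = 0;
  for (const VAArg &A : Args) {
    VAOffset = (VAOffset + A.AlignBytes - 1) & ~(A.AlignBytes - 1);
    Offsets.push_back(VAOffset);
    VAOffset += A.SizeBytes;
  }
  return VAOffset; // size of the vararg byte array
}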
// Args.size() and Outs.size() need not match.
// Outs.size() will be larger
@@ -1580,43 +1505,19 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
assert((!IsByVal || TypeSize == ArgOuts[0].Flags.getByValSize()) &&
"type size mismatch");
- const std::optional<SDValue> ArgDeclare = [&]() -> std::optional<SDValue> {
- if (IsVAArg) {
- if (ArgI == FirstVAArg) {
- VADeclareParam = DAG.getNode(
- NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue},
- {Chain, ParamSymbol, GetI32(STI.getMaxRequiredAlignment()),
- GetI32(0), InGlue});
- return VADeclareParam;
- }
- return std::nullopt;
- }
- if (IsByVal || shouldPassAsArray(Arg.Ty)) {
- // declare .param .align <align> .b8 .param<n>[<size>];
- return DAG.getNode(NVPTXISD::DeclareArrayParam, dl,
- {MVT::Other, MVT::Glue},
- {Chain, ParamSymbol, GetI32(ArgAlign.value()),
- GetI32(TypeSize), InGlue});
- }
+ const SDValue ArgDeclare = [&]() {
+ if (IsVAArg)
+ return VADeclareParam;
+
+ if (IsByVal || shouldPassAsArray(Arg.Ty))
+ return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TypeSize);
+
assert(ArgOuts.size() == 1 && "We must pass only one value as non-array");
- // declare .param .b<size> .param<n>;
-
- // PTX ABI requires integral types to be at least 32 bits in
- // size. FP16 is loaded/stored using i16, so it's handled
- // here as well.
- const unsigned PromotedSize =
- (ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint())
- ? promoteScalarArgumentSize(TypeSize * 8)
- : TypeSize * 8;
-
- return DAG.getNode(NVPTXISD::DeclareScalarParam, dl,
- {MVT::Other, MVT::Glue},
- {Chain, ParamSymbol, GetI32(PromotedSize), InGlue});
+ assert((ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) &&
+ "Only int and float types are supported as non-array arguments");
+
+ return MakeDeclareScalarParam(ParamSymbol, TypeSize);
}();
- if (ArgDeclare) {
- Chain = ArgDeclare->getValue(0);
- InGlue = ArgDeclare->getValue(1);
- }
// PTX Interoperability Guide 3.3(A): [Integer] Values shorter
// than 32-bits are sign extended or zero extended, depending on
@@ -1626,36 +1527,25 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32;
const auto GetStoredValue = [&](const unsigned I, EVT EltVT,
- const Align PartAlign) {
- SDValue StVal;
+ const MaybeAlign PartAlign) {
if (IsByVal) {
SDValue Ptr = ArgOutVals[0];
auto MPI = refinePtrAS(Ptr, DAG, DL, *this);
SDValue SrcAddr =
DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(Offsets[I]));
- StVal = DAG.getLoad(EltVT, dl, TempChain, SrcAddr, MPI, PartAlign);
- } else {
- StVal = ArgOutVals[I];
-
- auto PromotedVT = promoteScalarIntegerPTX(StVal.getValueType());
- if (PromotedVT != StVal.getValueType()) {
- StVal = DAG.getNode(getExtOpcode(ArgOuts[I].Flags), dl, PromotedVT,
- StVal);
- }
+ return DAG.getLoad(EltVT, dl, CallChain, SrcAddr, MPI, PartAlign);
}
+ SDValue StVal = ArgOutVals[I];
+ assert(promoteScalarIntegerPTX(StVal.getValueType()) ==
+ StVal.getValueType() &&
+ "OutVal type should always be legal");
- if (ExtendIntegerParam) {
- assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
- // zext/sext to i32
- StVal =
- DAG.getNode(getExtOpcode(ArgOuts[I].Flags), dl, MVT::i32, StVal);
- } else if (EltVT.getSizeInBits() < 16) {
- // Use 16-bit registers for small stores as it's the
- // smallest general purpose register size supported by NVPTX.
- StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
- }
- return StVal;
+ const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
+ const EVT StoreVT =
+ ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
+
+ return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl);
};
const auto VectorInfo =
@@ -1664,23 +1554,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
unsigned J = 0;
for (const unsigned NumElts : VectorInfo) {
const int CurOffset = Offsets[J];
- EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
- const Align PartAlign = commonAlignment(ArgAlign, CurOffset);
-
- // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
- // scalar store. In such cases, fall back to byte stores.
- if (NumElts == 1 && !IsVAArg && PartAlign < DAG.getEVTAlign(EltVT)) {
-
- SDValue StVal = GetStoredValue(J, EltVT, PartAlign);
- Chain = LowerUnalignedStoreParam(DAG, Chain,
- CurOffset + (IsByVal ? VAOffset : 0),
- EltVT, StVal, InGlue, ArgI, dl);
-
- // LowerUnalignedStoreParam took care of inserting the necessary nodes
- // into the SDAG, so just move on to the next element.
- J++;
- continue;
- }
+ const EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
if (IsVAArg && !IsByVal)
// Align each part of the variadic argument to their type.
@@ -1688,44 +1562,45 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
assert((IsVAArg || VAOffset == 0) &&
"VAOffset must be 0 for non-VA args");
- SmallVector<SDValue, 6> StoreOperands{
- Chain, GetI32(IsVAArg ? FirstVAArg : ArgI),
- GetI32(VAOffset + ((IsVAArg && !IsByVal) ? 0 : CurOffset))};
- // Record the values to store.
- for (const unsigned K : llvm::seq(NumElts))
- StoreOperands.push_back(GetStoredValue(J + K, EltVT, PartAlign));
- StoreOperands.push_back(InGlue);
+ const unsigned Offset =
+ (VAOffset + ((IsVAArg && !IsByVal) ? 0 : CurOffset));
+ SDValue Ptr =
+ DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset));
- NVPTXISD::NodeType Op;
- switch (NumElts) {
- case 1:
- Op = NVPTXISD::StoreParam;
- break;
- case 2:
- Op = NVPTXISD::StoreParamV2;
- break;
- case 4:
- Op = NVPTXISD::StoreParamV4;
- break;
- default:
- llvm_unreachable("Invalid vector info.");
+ const MaybeAlign CurrentAlign = ExtendIntegerParam
+ ? MaybeAlign(std::nullopt)
+ : commonAlignment(ArgAlign, Offset);
+
+ SDValue Val;
+ if (NumElts == 1) {
+ Val = GetStoredValue(J, EltVT, CurrentAlign);
+ } else {
+ SmallVector<SDValue, 8> StoreVals;
+ for (const unsigned K : llvm::seq(NumElts)) {
+ SDValue ValJ = GetStoredValue(J + K, EltVT, CurrentAlign);
+ if (ValJ.getValueType().isVector())
+ DAG.ExtractVectorElements(ValJ, StoreVals);
+ else
+ StoreVals.push_back(ValJ);
+ }
+
+ EVT VT = EVT::getVectorVT(
+ *DAG.getContext(), StoreVals[0].getValueType(), StoreVals.size());
+ Val = DAG.getBuildVector(VT, dl, StoreVals);
}
- // Adjust type of the store op if we've extended the scalar
- // return value.
- EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
- Chain = DAG.getMemIntrinsicNode(
- Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
- TheStoreType, MachinePointerInfo(), PartAlign,
- MachineMemOperand::MOStore);
- InGlue = Chain.getValue(1);
+ SDValue StoreParam =
+ DAG.getStore(ArgDeclare, dl, Val, Ptr,
+ MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign);
+ CallPrereqs.push_back(StoreParam);
// TODO: We may need to support vector types that can be passed
// as scalars in variadic arguments.
if (IsVAArg && !IsByVal) {
assert(NumElts == 1 &&
"Vectorization is expected to be disabled for variadics.");
+ const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
VAOffset +=
DL.getTypeAllocSize(TheStoreType.getTypeForEVT(*DAG.getContext()));
}
@@ -1736,33 +1611,21 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
VAOffset += TypeSize;
}
- GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
-
// Handle Result
if (!Ins.empty()) {
- const SDValue RetDeclare = [&]() {
- const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
- const unsigned ResultSize = DL.getTypeAllocSizeInBits(RetTy);
- if (shouldPassAsArray(RetTy)) {
- const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
- return DAG.getNode(NVPTXISD::DeclareArrayParam, dl,
- {MVT::Other, MVT::Glue},
- {Chain, RetSymbol, GetI32(RetAlign.value()),
- GetI32(ResultSize / 8), InGlue});
- }
- const auto PromotedResultSize = promoteScalarArgumentSize(ResultSize);
- return DAG.getNode(
- NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue},
- {Chain, RetSymbol, GetI32(PromotedResultSize), InGlue});
- }();
- Chain = RetDeclare.getValue(0);
- InGlue = RetDeclare.getValue(1);
+ const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
+ const unsigned ResultSize = DL.getTypeAllocSize(RetTy);
+ if (shouldPassAsArray(RetTy)) {
+ const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
+ MakeDeclareArrayParam(RetSymbol, RetAlign, ResultSize);
+ } else {
+ MakeDeclareScalarParam(RetSymbol, ResultSize);
+ }
}
- const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
// Set the size of the vararg param byte array if the callee is a variadic
// function and the variadic part is not empty.
- if (HasVAArgs) {
+ if (VADeclareParam) {
SDValue DeclareParamOps[] = {VADeclareParam.getOperand(0),
VADeclareParam.getOperand(1),
VADeclareParam.getOperand(2), GetI32(VAOffset),
@@ -1771,6 +1634,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
VADeclareParam->getVTList(), DeclareParamOps);
}
+ const auto *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
// If the type of the callsite does not match that of the function, convert
// the callsite to an indirect call.
const bool ConvertToIndirectCall = shouldConvertToIndirectCall(CB, Func);
@@ -1800,15 +1664,16 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// instruction.
// The prototype is embedded in a string and put as the operand for a
// CallPrototype SDNode which will print out to the value of the string.
+ const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
std::string Proto =
getPrototype(DL, RetTy, Args, CLI.Outs,
HasVAArgs ? std::optional(FirstVAArg) : std::nullopt, *CB,
UniqueCallSite);
const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
- Chain = DAG.getNode(
- NVPTXISD::CallPrototype, dl, {MVT::Other, MVT::Glue},
- {Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InGlue});
- InGlue = Chain.getValue(1);
+ const SDValue PrototypeDeclare = DAG.getNode(
+ NVPTXISD::CallPrototype, dl, MVT::Other,
+ {StartChain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32)});
+ CallPrereqs.push_back(PrototypeDeclare);
}
if (ConvertToIndirectCall) {
@@ -1826,24 +1691,15 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const unsigned NumArgs =
std::min<unsigned>(CLI.NumFixedArgs + 1, Args.size());
/// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
- /// NumParams, Callee, Proto, InGlue)
- Chain = DAG.getNode(NVPTXISD::CALL, dl, {MVT::Other, MVT::Glue},
- {Chain, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall),
- GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee,
- GetI32(Proto), InGlue});
- InGlue = Chain.getValue(1);
-
+ /// NumParams, Callee, Proto)
+ const SDValue CallToken = DAG.getTokenFactor(dl, CallPrereqs);
+ const SDValue Call = DAG.getNode(
+ NVPTXISD::CALL, dl, MVT::Other,
+ {CallToken, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall),
+ GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, GetI32(Proto)});
+
+ SmallVector<SDValue, 16> LoadChains{Call};
SmallVector<SDValue, 16> ProxyRegOps;
- // An item of the vector is filled if the element does not need a ProxyReg
- // operation on it and should be added to InVals as is. ProxyRegOps and
- // ProxyRegTruncates contain empty/none items at the same index.
- SmallVector<SDValue, 16> RetElts;
-  // Temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()`
-  // to use the values of `LoadParam`s; they are replaced later, once
-  // `CALLSEQ_END` is added.
- SmallVector<SDValue, 16> TempProxyRegOps;
-
- // Generate loads from param memory/moves from registers for result
if (!Ins.empty()) {
SmallVector<EVT, 16> VTs;
SmallVector<uint64_t, 16> Offsets;
@@ -1860,104 +1716,65 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
unsigned I = 0;
- for (const unsigned VectorizedSize : VectorInfo) {
- EVT TheLoadType = promoteScalarIntegerPTX(VTs[I]);
- EVT EltType = Ins[I].VT;
- const Align EltAlign = commonAlignment(RetAlign, Offsets[I]);
-
- if (TheLoadType != VTs[I])
- EltType = TheLoadType;
-
- if (ExtendIntegerRetVal) {
- TheLoadType = MVT::i32;
- EltType = MVT::i32;
- } else if (TheLoadType.getSizeInBits() < 16) {
- EltType = MVT::i16;
- }
+ for (const unsigned NumElts : VectorInfo) {
+ const MaybeAlign CurrentAlign =
+ ExtendIntegerRetVal ? MaybeAlign(std::nullopt)
+ : commonAlignment(RetAlign, Offsets[I]);
- // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
- // scalar load. In such cases, fall back to byte loads.
- if (VectorizedSize == 1 && RetTy->isAggregateType() &&
- EltAlign < DAG.getEVTAlign(TheLoadType)) {
- SDValue Ret = LowerUnalignedLoadRetParam(
- DAG, Chain, Offsets[I], TheLoadType, InGlue, TempProxyRegOps, dl);
- ProxyRegOps.push_back(SDValue());
- RetElts.resize(I);
- RetElts.push_back(Ret);
-
- I++;
- continue;
- }
+ const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
+ const EVT LoadVT =
+ ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
- SmallVector<EVT, 6> LoadVTs(VectorizedSize, EltType);
- LoadVTs.append({MVT::Other, MVT::Glue});
+ const unsigned PackingAmt =
+ LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
- NVPTXISD::NodeType Op;
- switch (VectorizedSize) {
- case 1:
- Op = NVPTXISD::LoadParam;
- break;
- case 2:
- Op = NVPTXISD::LoadParamV2;
- break;
- case 4:
- Op = NVPTXISD::LoadParamV4;
- break;
- default:
- llvm_unreachable("Invalid vector info.");
- }
+ const EVT VecVT = NumElts == 1 ? LoadVT
+ : EVT::getVectorVT(*DAG.getContext(),
+ LoadVT.getScalarType(),
+ NumElts * PackingAmt);
- SDValue LoadOperands[] = {Chain, GetI32(1), GetI32(Offsets[I]), InGlue};
- SDValue RetVal = DAG.getMemIntrinsicNode(
- Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
- MachinePointerInfo(), EltAlign, MachineMemOperand::MOLoad);
+ const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
+ SDValue Ptr =
+ DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
- for (const unsigned J : llvm::seq(VectorizedSize)) {
- ProxyRegOps.push_back(RetVal.getValue(J));
- }
+ SDValue R =
+ DAG.getLoad(VecVT, dl, Call, Ptr,
+ MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign);
- Chain = RetVal.getValue(VectorizedSize);
- InGlue = RetVal.getValue(VectorizedSize + 1);
+ LoadChains.push_back(R.getValue(1));
- I += VectorizedSize;
+ if (NumElts == 1)
+ ProxyRegOps.push_back(R);
+ else
+ for (const unsigned J : llvm::seq(NumElts)) {
+ SDValue Elt = DAG.getNode(
+ LoadVT.isVector() ? ISD::EXTRACT_SUBVECTOR
+ : ISD::EXTRACT_VECTOR_ELT,
+ dl, LoadVT, R, DAG.getVectorIdxConstant(J * PackingAmt, dl));
+ ProxyRegOps.push_back(Elt);
+ }
+ I += NumElts;
}
}
- Chain =
- DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl);
- InGlue = Chain.getValue(1);
+ const SDValue EndToken = DAG.getTokenFactor(dl, LoadChains);
+ const SDValue CallEnd = DAG.getCALLSEQ_END(EndToken, UniqueCallSite,
+ UniqueCallSite + 1, SDValue(), dl);
  // Append ProxyReg instructions to the chain to make sure that `callseq_end`
  // will not get lost. Otherwise, during libcall expansion, the nodes can
  // become dangling.
- for (const unsigned I : llvm::seq(ProxyRegOps.size())) {
- if (I < RetElts.size() && RetElts[I]) {
- InVals.push_back(RetElts[I]);
- continue;
- }
-
- SDValue Ret =
- DAG.getNode(NVPTXISD::ProxyReg, dl, ProxyRegOps[I].getSimpleValueType(),
- {Chain, ProxyRegOps[I]});
-
- const EVT ExpectedVT = Ins[I].VT;
- if (!Ret.getValueType().bitsEq(ExpectedVT)) {
- Ret = DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, Ret);
- }
+ for (const auto [I, Reg] : llvm::enumerate(ProxyRegOps)) {
+ SDValue Proxy =
+ DAG.getNode(NVPTXISD::ProxyReg, dl, Reg.getValueType(), {CallEnd, Reg});
+ SDValue Ret = correctParamType(Proxy, Ins[I].VT, Ins[I].Flags, DAG, dl);
InVals.push_back(Ret);
}
- for (SDValue &T : TempProxyRegOps) {
- SDValue Repl = DAG.getNode(NVPTXISD::ProxyReg, dl, T.getSimpleValueType(),
- {Chain, T.getOperand(0)});
- DAG.ReplaceAllUsesWith(T, Repl);
- DAG.RemoveDeadNode(T.getNode());
- }
-
- // set isTailCall to false for now, until we figure out how to express
+  // Set IsTailCall to false for now, until we figure out how to express
// tail call optimization in PTX
- isTailCall = false;
- return Chain;
+ CLI.IsTailCall = false;
+ return CallEnd;
}
SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
@@ -5117,10 +4934,6 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
Operands.push_back(DCI.DAG.getIntPtrConstant(
cast<LoadSDNode>(LD)->getExtensionType(), DL));
break;
- case NVPTXISD::LoadParamV2:
- OldNumOutputs = 2;
- Opcode = NVPTXISD::LoadParamV4;
- break;
case NVPTXISD::LoadV2:
OldNumOutputs = 2;
Opcode = NVPTXISD::LoadV4;
@@ -5201,12 +5014,6 @@ static SDValue combinePackingMovIntoStore(SDNode *N,
MemVT = ST->getMemoryVT();
Opcode = NVPTXISD::StoreV2;
break;
- case NVPTXISD::StoreParam:
- Opcode = NVPTXISD::StoreParamV2;
- break;
- case NVPTXISD::StoreParamV2:
- Opcode = NVPTXISD::StoreParamV4;
- break;
case NVPTXISD::StoreV2:
MemVT = ST->getMemoryVT();
Opcode = NVPTXISD::StoreV4;
@@ -5218,7 +5025,6 @@ static SDValue combinePackingMovIntoStore(SDNode *N,
return SDValue();
Opcode = NVPTXISD::StoreV8;
break;
- case NVPTXISD::StoreParamV4:
case NVPTXISD::StoreV8:
// PTX doesn't support the next doubling of operands
return SDValue();
@@ -5263,30 +5069,11 @@ static SDValue combinePackingMovIntoStore(SDNode *N,
MemVT, ST->getMemOperand());
}
-static SDValue PerformStoreCombineHelper(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- unsigned Front, unsigned Back) {
- if (all_of(N->ops().drop_front(Front).drop_back(Back),
- [](const SDUse &U) { return U.get()->isUndef(); }))
- // Operand 0 is the previous value in the chain. Cannot return EntryToken
- // as the previous value will become unused and eliminated later.
- return N->getOperand(0);
-
- return combinePackingMovIntoStore(N, DCI, Front, Back);
-}
-
static SDValue PerformStoreCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
return combinePackingMovIntoStore(N, DCI, 1, 2);
}
-static SDValue PerformStoreParamCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- // Operands from the 3rd to the 2nd last one are the values to be stored.
- // {Chain, ArgID, Offset, Val, Glue}
- return PerformStoreCombineHelper(N, DCI, 3, 1);
-}
-
/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
@@ -5942,6 +5729,86 @@ static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
N->getConstantOperandAPInt(2),
N->getConstantOperandVal(3)),
SDLoc(N), N->getValueType(0));
+ return SDValue();
+}
+
+// During call lowering we wrap the return values in ProxyReg nodes, which
+// depend on the chain value produced by the completed call. This ensures that
+// the full call is emitted in cases where libcalls are used to legalize
+// operations. To improve the functioning of other DAG combines, we pull all
+// operations we can through one of these nodes, ensuring that the ProxyReg
+// directly wraps a load. That is:
+//
+// (ProxyReg (zext (load retval0))) => (zext (ProxyReg (load retval0)))
+//
+static SDValue sinkProxyReg(SDValue R, SDValue Chain,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ switch (R.getOpcode()) {
+ case ISD::TRUNCATE:
+ case ISD::ANY_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::BITCAST: {
+ if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
+ return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), V);
+ return SDValue();
+ }
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA:
+ case ISD::OR: {
+ if (SDValue A = sinkProxyReg(R.getOperand(0), Chain, DCI))
+ if (SDValue B = sinkProxyReg(R.getOperand(1), Chain, DCI))
+ return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), A, B);
+ return SDValue();
+ }
+ case ISD::Constant:
+ return R;
+ case ISD::LOAD:
+ case NVPTXISD::LoadV2:
+ case NVPTXISD::LoadV4: {
+ return DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(R), R.getValueType(),
+ {Chain, R});
+ }
+ case ISD::BUILD_VECTOR: {
+ if (DCI.isBeforeLegalize())
+ return SDValue();
+
+ SmallVector<SDValue, 16> Ops;
+ for (auto &Op : R->ops()) {
+ SDValue V = sinkProxyReg(Op, Chain, DCI);
+ if (!V)
+ return SDValue();
+ Ops.push_back(V);
+ }
+ return DCI.DAG.getNode(ISD::BUILD_VECTOR, SDLoc(R), R.getValueType(), Ops);
+ }
+ case ISD::EXTRACT_VECTOR_ELT: {
+ if (DCI.isBeforeLegalize())
+ return SDValue();
+
+ if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
+ return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(R),
+ R.getValueType(), V, R.getOperand(1));
+ return SDValue();
+ }
+ default:
+ return SDValue();
+ }
+}
+
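The recursion above can be pictured on a toy expression tree: the proxy is pushed toward the leaf until it wraps the load itself. A minimal sketch of that shape (not SelectionDAG code; Expr and sinkProxy are illustrative):

#include <memory>

// Toy model of sinkProxyReg: rewrite Proxy(ZExt(Load)) as ZExt(Proxy(Load)).
struct Expr {
  enum Kind { Load, ZExt, Proxy } K;
  std::shared_ptr<Expr> Op; // null for Load
};

static std::shared_ptr<Expr> sinkProxy(const std::shared_ptr<Expr> &E) {
  if (E->K == Expr::Load) // wrap the leaf directly
    return std::make_shared<Expr>(Expr{Expr::Proxy, E});
  if (E->K == Expr::ZExt) // push the proxy through, then rebuild the zext
    if (auto V = sinkProxy(E->Op))
      return std::make_shared<Expr>(Expr{Expr::ZExt, V});
  return nullptr; // cannot sink through this node
}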
+static SDValue combineProxyReg(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+
+ SDValue Chain = N->getOperand(0);
+ SDValue Reg = N->getOperand(1);
+
+ // If the ProxyReg is not wrapping a load, try to pull the operations through
+ // the ProxyReg.
+ if (Reg.getOpcode() != ISD::LOAD) {
+ if (SDValue V = sinkProxyReg(Reg, Chain, DCI))
+ return V;
+ }
return SDValue();
}
@@ -5965,7 +5832,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FADD:
return PerformFADDCombine(N, DCI, OptLevel);
case ISD::LOAD:
- case NVPTXISD::LoadParamV2:
case NVPTXISD::LoadV2:
case NVPTXISD::LoadV4:
return combineUnpackingMovIntoLoad(N, DCI);
@@ -5973,6 +5839,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return PerformMULCombine(N, DCI, OptLevel);
case NVPTXISD::PRMT:
return combinePRMT(N, DCI, OptLevel);
+ case NVPTXISD::ProxyReg:
+ return combineProxyReg(N, DCI);
case ISD::SETCC:
return PerformSETCCCombine(N, DCI, STI.getSmVersion());
case ISD::SHL:
@@ -5980,10 +5848,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SREM:
case ISD::UREM:
return PerformREMCombine(N, DCI, OptLevel);
- case NVPTXISD::StoreParam:
- case NVPTXISD::StoreParamV2:
- case NVPTXISD::StoreParamV4:
- return PerformStoreParamCombine(N, DCI);
case ISD::STORE:
case NVPTXISD::StoreV2:
case NVPTXISD::StoreV4:
@@ -6332,6 +6196,22 @@ static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
Results.push_back(NewValue.getValue(3));
}
+static void replaceProxyReg(SDNode *N, SelectionDAG &DAG,
+ const TargetLowering &TLI,
+ SmallVectorImpl<SDValue> &Results) {
+ SDValue Chain = N->getOperand(0);
+ SDValue Reg = N->getOperand(1);
+
+ MVT VT = TLI.getRegisterType(*DAG.getContext(), Reg.getValueType());
+
+ SDValue NewReg = DAG.getAnyExtOrTrunc(Reg, SDLoc(N), VT);
+ SDValue NewProxy =
+ DAG.getNode(NVPTXISD::ProxyReg, SDLoc(N), VT, {Chain, NewReg});
+ SDValue Res = DAG.getAnyExtOrTrunc(NewProxy, SDLoc(N), N->getValueType(0));
+
+ Results.push_back(Res);
+}
+
void NVPTXTargetLowering::ReplaceNodeResults(
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
switch (N->getOpcode()) {
@@ -6349,6 +6229,9 @@ void NVPTXTargetLowering::ReplaceNodeResults(
case ISD::CopyFromReg:
ReplaceCopyFromReg_128(N, DAG, Results);
return;
+ case NVPTXISD::ProxyReg:
+ replaceProxyReg(N, DAG, *this, Results);
+ return;
}
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 228e2aa..cf72a1e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -38,7 +38,7 @@ enum NodeType : unsigned {
  /// This node represents a PTX call instruction. Its operands are as follows:
///
/// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
- /// NumParams, Callee, Proto, InGlue)
+ /// NumParams, Callee, Proto)
CALL,
MoveParam,
@@ -84,13 +84,7 @@ enum NodeType : unsigned {
StoreV2,
StoreV4,
StoreV8,
- LoadParam,
- LoadParamV2,
- LoadParamV4,
- StoreParam,
- StoreParamV2,
- StoreParamV4,
- LAST_MEMORY_OPCODE = StoreParamV4,
+ LAST_MEMORY_OPCODE = StoreV8,
};
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 442b900..86d6f7c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1757,12 +1757,6 @@ def SDTDeclareArrayParam :
SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>;
def SDTDeclareScalarParam :
SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
-def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
-def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
-def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
-def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
-def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
-def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTMoveParamProfile : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisSameAs<0, 1>]>;
def SDTProxyReg : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>]>;
@@ -1774,104 +1768,20 @@ def declare_array_param :
def declare_scalar_param :
SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParam,
[SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-
-def LoadParam :
- SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
- [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
-def LoadParamV2 :
- SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
- [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
-def LoadParamV4 :
- SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
- [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
-def StoreParam :
- SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParamV2 :
- SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParamV4 :
- SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def MoveParam :
SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;
def proxy_reg :
SDNode<"NVPTXISD::ProxyReg", SDTProxyReg, [SDNPHasChain]>;
/// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
- /// NumParams, Callee, Proto, InGlue)
+ /// NumParams, Callee, Proto)
def SDTCallProfile : SDTypeProfile<0, 6,
[SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>,
SDTCisVT<3, i32>, SDTCisVT<5, i32>]>;
-def call :
- SDNode<"NVPTXISD::CALL", SDTCallProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-
-let mayLoad = true in {
- class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst), (ins Offseti32imm:$b),
- !strconcat("ld.param", opstr, " \t$dst, [retval0$b];"),
- []>;
-
- class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins Offseti32imm:$b),
- !strconcat("ld.param.v2", opstr,
- " \t{{$dst, $dst2}}, [retval0$b];"), []>;
-
- class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
- regclass:$dst4),
- (ins Offseti32imm:$b),
- !strconcat("ld.param.v4", opstr,
- " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0$b];"),
- []>;
-}
-
-let mayStore = true in {
-
- multiclass StoreParamInst<NVPTXRegClass regclass, Operand IMMType, string opstr, bit support_imm = true> {
- foreach op = [IMMType, regclass] in
- if !or(support_imm, !isa<NVPTXRegClass>(op)) then
- def _ # !if(!isa<NVPTXRegClass>(op), "r", "i")
- : NVPTXInst<(outs),
- (ins op:$val, i32imm:$a, Offseti32imm:$b),
- "st.param" # opstr # " \t[param$a$b], $val;",
- []>;
- }
-
- multiclass StoreParamV2Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> {
- foreach op1 = [IMMType, regclass] in
- foreach op2 = [IMMType, regclass] in
- def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i")
- # !if(!isa<NVPTXRegClass>(op2), "r", "i")
- : NVPTXInst<(outs),
- (ins op1:$val1, op2:$val2,
- i32imm:$a, Offseti32imm:$b),
- "st.param.v2" # opstr # " \t[param$a$b], {{$val1, $val2}};",
- []>;
- }
-
- multiclass StoreParamV4Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> {
- foreach op1 = [IMMType, regclass] in
- foreach op2 = [IMMType, regclass] in
- foreach op3 = [IMMType, regclass] in
- foreach op4 = [IMMType, regclass] in
- def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i")
- # !if(!isa<NVPTXRegClass>(op2), "r", "i")
- # !if(!isa<NVPTXRegClass>(op3), "r", "i")
- # !if(!isa<NVPTXRegClass>(op4), "r", "i")
-
- : NVPTXInst<(outs),
- (ins op1:$val1, op2:$val2, op3:$val3, op4:$val4,
- i32imm:$a, Offseti32imm:$b),
- "st.param.v4" # opstr #
- " \t[param$a$b], {{$val1, $val2, $val3, $val4}};",
- []>;
- }
-}
+def call : SDNode<"NVPTXISD::CALL", SDTCallProfile, [SDNPHasChain, SDNPSideEffect]>;
/// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
-/// NumParams, Callee, Proto, InGlue)
+/// NumParams, Callee, Proto)
def CallOperand : Operand<i32> { let PrintMethod = "printCallOperand"; }
@@ -1908,43 +1818,6 @@ foreach is_convergent = [0, 1] in {
(call_uni_inst $addr, imm:$rets, imm:$params)>;
}
-def LoadParamMemI64 : LoadParamMemInst<B64, ".b64">;
-def LoadParamMemI32 : LoadParamMemInst<B32, ".b32">;
-def LoadParamMemI16 : LoadParamMemInst<B16, ".b16">;
-def LoadParamMemI8 : LoadParamMemInst<B16, ".b8">;
-def LoadParamMemV2I64 : LoadParamV2MemInst<B64, ".b64">;
-def LoadParamMemV2I32 : LoadParamV2MemInst<B32, ".b32">;
-def LoadParamMemV2I16 : LoadParamV2MemInst<B16, ".b16">;
-def LoadParamMemV2I8 : LoadParamV2MemInst<B16, ".b8">;
-def LoadParamMemV4I32 : LoadParamV4MemInst<B32, ".b32">;
-def LoadParamMemV4I16 : LoadParamV4MemInst<B16, ".b16">;
-def LoadParamMemV4I8 : LoadParamV4MemInst<B16, ".b8">;
-
-defm StoreParamI64 : StoreParamInst<B64, i64imm, ".b64">;
-defm StoreParamI32 : StoreParamInst<B32, i32imm, ".b32">;
-defm StoreParamI16 : StoreParamInst<B16, i16imm, ".b16">;
-defm StoreParamI8 : StoreParamInst<B16, i8imm, ".b8">;
-
-defm StoreParamI8TruncI32 : StoreParamInst<B32, i8imm, ".b8", /* support_imm */ false>;
-defm StoreParamI8TruncI64 : StoreParamInst<B64, i8imm, ".b8", /* support_imm */ false>;
-
-defm StoreParamV2I64 : StoreParamV2Inst<B64, i64imm, ".b64">;
-defm StoreParamV2I32 : StoreParamV2Inst<B32, i32imm, ".b32">;
-defm StoreParamV2I16 : StoreParamV2Inst<B16, i16imm, ".b16">;
-defm StoreParamV2I8 : StoreParamV2Inst<B16, i8imm, ".b8">;
-
-defm StoreParamV4I32 : StoreParamV4Inst<B32, i32imm, ".b32">;
-defm StoreParamV4I16 : StoreParamV4Inst<B16, i16imm, ".b16">;
-defm StoreParamV4I8 : StoreParamV4Inst<B16, i8imm, ".b8">;
-
-defm StoreParamF32 : StoreParamInst<B32, f32imm, ".b32">;
-defm StoreParamF64 : StoreParamInst<B64, f64imm, ".b64">;
-
-defm StoreParamV2F32 : StoreParamV2Inst<B32, f32imm, ".b32">;
-defm StoreParamV2F64 : StoreParamV2Inst<B64, f64imm, ".b64">;
-
-defm StoreParamV4F32 : StoreParamV4Inst<B32, f32imm, ".b32">;
-
def DECLARE_PARAM_array :
NVPTXInst<(outs), (ins i32imm:$a, i32imm:$align, i32imm:$size),
".param .align $align .b8 \t$a[$size];", []>;
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 5779d4e..0e8828f 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -243,8 +243,6 @@ public:
createObjectTargetWriter() const override {
return createPPCXCOFFObjectWriter(TT.isArch64Bit());
}
-
- std::optional<MCFixupKind> getFixupKind(StringRef Name) const override;
};
} // end anonymous namespace
@@ -279,13 +277,6 @@ ELFPPCAsmBackend::getFixupKind(StringRef Name) const {
return std::nullopt;
}
-std::optional<MCFixupKind>
-XCOFFPPCAsmBackend::getFixupKind(StringRef Name) const {
- return StringSwitch<std::optional<MCFixupKind>>(Name)
- .Case("R_REF", PPC::fixup_ppc_nofixup)
- .Default(std::nullopt);
-}
-
MCAsmBackend *llvm::createPPCAsmBackend(const Target &T,
const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
index 9e8ee9f..df0c666 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
@@ -48,8 +48,7 @@ enum Fixups {
/// Not a true fixup, but ties a symbol to a call to __tls_get_addr for the
/// TLS general and local dynamic models, or inserts the thread-pointer
- /// register number. It can also be used to tie the ref symbol to prevent it
- /// from being garbage collected on AIX.
+ /// register number.
fixup_ppc_nofixup,
/// A 16-bit fixup corresponding to lo16(_foo) with implied 3 zero bits for
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index 8baf866..1af2f9c 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -220,8 +220,6 @@ bool PPCELFMCAsmInfo::evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr,
return evaluateAsRelocatable(Expr, Res, Asm);
}
-void PPCXCOFFMCAsmInfo::anchor() {}
-
PPCXCOFFMCAsmInfo::PPCXCOFFMCAsmInfo(bool Is64Bit, const Triple &T) {
if (T.getArch() == Triple::ppc64le || T.getArch() == Triple::ppcle)
report_fatal_error("XCOFF is not supported for little-endian targets");
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
index 0f945b3..6af1bd7 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
@@ -33,8 +33,6 @@ public:
};
class PPCXCOFFMCAsmInfo : public MCAsmInfoXCOFF {
- void anchor() override;
-
public:
explicit PPCXCOFFMCAsmInfo(bool is64Bit, const Triple &);
void printSpecifierExpr(raw_ostream &OS,
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 54497d9..3dad0e8 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -213,7 +213,7 @@ public:
void emitTCEntry(const MCSymbol &S, PPCMCExpr::Specifier Kind) override {
if (const MCSymbolXCOFF *XSym = dyn_cast<MCSymbolXCOFF>(&S)) {
MCSymbolXCOFF *TCSym =
- cast<MCSectionXCOFF>(Streamer.getCurrentSectionOnly())
+ static_cast<const MCSectionXCOFF *>(Streamer.getCurrentSectionOnly())
->getQualNameSymbol();
// On AIX, we have TLS variable offsets (symbol@({gd|ie|le|ld}) depending
// on the TLS access method (or model). For the general-dynamic access
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
index f75ab62..a04f404 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
@@ -56,6 +56,8 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize(
switch ((unsigned)Fixup.getKind()) {
default:
report_fatal_error("Unimplemented fixup kind.");
+ case XCOFF::RelocationType::R_REF:
+ return {XCOFF::RelocationType::R_REF, 0};
case PPC::fixup_ppc_half16: {
const uint8_t SignAndSizeForHalf16 = EncodedSignednessIndicator | 15;
switch (Specifier) {
@@ -96,12 +98,6 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize(
return {XCOFF::RelocationType::R_RBR, EncodedSignednessIndicator | 25};
case PPC::fixup_ppc_br24abs:
return {XCOFF::RelocationType::R_RBA, EncodedSignednessIndicator | 25};
- case PPC::fixup_ppc_nofixup: {
- if (Specifier == PPC::S_None)
- return {XCOFF::RelocationType::R_REF, 0};
- else
- llvm_unreachable("Unsupported Modifier");
- } break;
case FK_Data_4:
case FK_Data_8:
const uint8_t SignAndSizeForFKData =
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index a091b21..ce1d51a 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -2274,9 +2274,9 @@ void PPCAIXAsmPrinter::emitLinkage(const GlobalValue *GV,
void PPCAIXAsmPrinter::SetupMachineFunction(MachineFunction &MF) {
// Setup CurrentFnDescSym and its containing csect.
- MCSectionXCOFF *FnDescSec =
- cast<MCSectionXCOFF>(getObjFileLowering().getSectionForFunctionDescriptor(
- &MF.getFunction(), TM));
+ auto *FnDescSec = static_cast<MCSectionXCOFF *>(
+ getObjFileLowering().getSectionForFunctionDescriptor(&MF.getFunction(),
+ TM));
FnDescSec->setAlignment(Align(Subtarget->isPPC64() ? 8 : 4));
CurrentFnDescSym = FnDescSec->getQualNameSymbol();
@@ -2669,9 +2669,9 @@ void PPCAIXAsmPrinter::emitTracebackTable() {
MCSymbol *EHInfoSym =
TargetLoweringObjectFileXCOFF::getEHInfoTableSymbol(MF);
MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(EHInfoSym, TOCType_EHBlock);
- const MCSymbol *TOCBaseSym =
- cast<MCSectionXCOFF>(getObjFileLowering().getTOCBaseSection())
- ->getQualNameSymbol();
+ const MCSymbol *TOCBaseSym = static_cast<const MCSectionXCOFF *>(
+ getObjFileLowering().getTOCBaseSection())
+ ->getQualNameSymbol();
const MCExpr *Exp =
MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCEntry, Ctx),
MCSymbolRefExpr::create(TOCBaseSym, Ctx), Ctx);
@@ -2788,7 +2788,7 @@ void PPCAIXAsmPrinter::emitGlobalVariableHelper(const GlobalVariable *GV) {
}
}
- MCSectionXCOFF *Csect = cast<MCSectionXCOFF>(
+ auto *Csect = static_cast<MCSectionXCOFF *>(
getObjFileLowering().SectionForGlobal(GV, GVKind, TM));
// Switch to the containing csect.
@@ -2869,9 +2869,9 @@ void PPCAIXAsmPrinter::emitFunctionDescriptor() {
OutStreamer->emitValue(MCSymbolRefExpr::create(CurrentFnSym, OutContext),
PointerSize);
// Emit TOC base address.
- const MCSymbol *TOCBaseSym =
- cast<MCSectionXCOFF>(getObjFileLowering().getTOCBaseSection())
- ->getQualNameSymbol();
+ const MCSymbol *TOCBaseSym = static_cast<const MCSectionXCOFF *>(
+ getObjFileLowering().getTOCBaseSection())
+ ->getQualNameSymbol();
OutStreamer->emitValue(MCSymbolRefExpr::create(TOCBaseSym, OutContext),
PointerSize);
// Emit a null environment pointer.
@@ -2996,10 +2996,10 @@ void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) {
Name += Prefix;
Name += cast<MCSymbolXCOFF>(I.first.first)->getSymbolTableName();
MCSymbol *S = OutContext.getOrCreateSymbol(Name);
- TCEntry = cast<MCSectionXCOFF>(
+ TCEntry = static_cast<MCSectionXCOFF *>(
getObjFileLowering().getSectionForTOCEntry(S, TM));
} else {
- TCEntry = cast<MCSectionXCOFF>(
+ TCEntry = static_cast<MCSectionXCOFF *>(
getObjFileLowering().getSectionForTOCEntry(I.first.first, TM));
}
OutStreamer->switchSection(TCEntry);
@@ -3054,7 +3054,7 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) {
return;
SectionKind GOKind = getObjFileLowering().getKindForGlobal(GO, TM);
- MCSectionXCOFF *Csect = cast<MCSectionXCOFF>(
+ auto *Csect = static_cast<MCSectionXCOFF *>(
getObjFileLowering().SectionForGlobal(GO, GOKind, TM));
Align GOAlign = getGVAlignment(GO, GO->getDataLayout());
@@ -3316,9 +3316,9 @@ void PPCAIXAsmPrinter::emitTTypeReference(const GlobalValue *GV,
GlobalType = TOCType_GlobalExternal;
MCSymbol *TypeInfoSym = TM.getSymbol(GV);
MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(TypeInfoSym, GlobalType);
- const MCSymbol *TOCBaseSym =
- cast<MCSectionXCOFF>(getObjFileLowering().getTOCBaseSection())
- ->getQualNameSymbol();
+ const MCSymbol *TOCBaseSym = static_cast<const MCSectionXCOFF *>(
+ getObjFileLowering().getTOCBaseSection())
+ ->getQualNameSymbol();
auto &Ctx = OutStreamer->getContext();
const MCExpr *Exp =
MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCEntry, Ctx),
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index a143d85..d71c42c 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -3849,9 +3849,14 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
switch (Inst.getOpcode()) {
default:
break;
- case RISCV::PseudoC_ADDI_NOP:
- emitToStreamer(Out, MCInstBuilder(RISCV::C_NOP));
+ case RISCV::PseudoC_ADDI_NOP: {
+ if (Inst.getOperand(2).getImm() == 0)
+ emitToStreamer(Out, MCInstBuilder(RISCV::C_NOP));
+ else
+ emitToStreamer(
+ Out, MCInstBuilder(RISCV::C_NOP_HINT).addOperand(Inst.getOperand(2)));
return false;
+ }
case RISCV::PseudoLLAImm:
case RISCV::PseudoLAImm:
case RISCV::PseudoLI: {
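Net effect, read together with the PseudoC_ADDI_NOP changes in RISCVInstrInfoC.td below: `c.addi x0, 0` still assembles to the canonical c.nop, while `c.addi x0, imm` with a nonzero immediate, previously rejected via the now-removed immzero operand, assembles to the `c.nop imm` hint (C_NOP_HINT).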
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index fa7bcfa..67cc01e 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -193,21 +193,19 @@ static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, uint32_t RegNo,
static DecodeStatus DecodeGPRNoX0RegisterClass(MCInst &Inst, uint32_t RegNo,
uint64_t Address,
const MCDisassembler *Decoder) {
- if (RegNo == 0) {
+ if (RegNo == 0)
return MCDisassembler::Fail;
- }
return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder);
}
-static DecodeStatus
-DecodeGPRNoX0X2RegisterClass(MCInst &Inst, uint64_t RegNo, uint32_t Address,
- const MCDisassembler *Decoder) {
- if (RegNo == 2) {
+static DecodeStatus DecodeGPRNoX2RegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint32_t Address,
+ const MCDisassembler *Decoder) {
+ if (RegNo == 2)
return MCDisassembler::Fail;
- }
- return DecodeGPRNoX0RegisterClass(Inst, RegNo, Address, Decoder);
+ return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder);
}
static DecodeStatus DecodeGPRNoX31RegisterClass(MCInst &Inst, uint32_t RegNo,
@@ -536,41 +534,26 @@ static DecodeStatus decodeRTZArg(MCInst &Inst, uint32_t Imm, int64_t Address,
return MCDisassembler::Success;
}
-static DecodeStatus decodeRVCInstrRdRs1ImmZero(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeRVCInstrRdSImm6(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeRVCInstrRdCLUIImm(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus
-decodeRVCInstrRdRs1UImmLog2XLenNonZero(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeRVCInstrRdRs2(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeRVCInstrRdRs1Rs2(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
-static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder);
-
static DecodeStatus decodeZcmpRlist(MCInst &Inst, uint32_t Imm,
uint64_t Address,
- const MCDisassembler *Decoder);
+ const MCDisassembler *Decoder) {
+ bool IsRVE = Decoder->getSubtargetInfo().hasFeature(RISCV::FeatureStdExtE);
+ if (Imm < RISCVZC::RA || (IsRVE && Imm >= RISCVZC::RA_S0_S2))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::createImm(Imm));
+ return MCDisassembler::Success;
+}
static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm,
uint64_t Address,
+ const MCDisassembler *Decoder) {
+ if (Imm < RISCVZC::RA_S0)
+ return MCDisassembler::Fail;
+ return decodeZcmpRlist(Inst, Imm, Address, Decoder);
+}
+
+static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn,
+ uint64_t Address,
const MCDisassembler *Decoder);
static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn,
@@ -579,18 +562,6 @@ static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn,
#include "RISCVGenDisassemblerTables.inc"
-static DecodeStatus decodeRVCInstrRdRs1ImmZero(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- DecodeStatus S = MCDisassembler::Success;
- uint32_t Rd = fieldFromInstruction(Insn, 7, 5);
- if (!Check(S, DecodeGPRNoX0RegisterClass(Inst, Rd, Address, Decoder)))
- return MCDisassembler::Fail;
- Inst.addOperand(Inst.getOperand(0));
- Inst.addOperand(MCOperand::createImm(0));
- return S;
-}
-
static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -601,66 +572,6 @@ static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn,
return MCDisassembler::Success;
}
-static DecodeStatus decodeRVCInstrRdSImm6(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- Inst.addOperand(MCOperand::createReg(RISCV::X0));
- uint32_t Imm =
- fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5);
- [[maybe_unused]] DecodeStatus Result =
- decodeSImmOperand<6>(Inst, Imm, Address, Decoder);
- assert(Result == MCDisassembler::Success && "Invalid immediate");
- return MCDisassembler::Success;
-}
-
-static DecodeStatus decodeRVCInstrRdCLUIImm(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- Inst.addOperand(MCOperand::createReg(RISCV::X0));
- uint32_t Imm =
- fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5);
- return decodeCLUIImmOperand(Inst, Imm, Address, Decoder);
-}
-
-static DecodeStatus
-decodeRVCInstrRdRs1UImmLog2XLenNonZero(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- Inst.addOperand(MCOperand::createReg(RISCV::X0));
- Inst.addOperand(Inst.getOperand(0));
-
- uint32_t UImm6 =
- fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5);
- return decodeUImmLog2XLenNonZeroOperand(Inst, UImm6, Address, Decoder);
-}
-
-static DecodeStatus decodeRVCInstrRdRs2(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- DecodeStatus S = MCDisassembler::Success;
- uint32_t Rd = fieldFromInstruction(Insn, 7, 5);
- uint32_t Rs2 = fieldFromInstruction(Insn, 2, 5);
- if (!Check(S, DecodeGPRRegisterClass(Inst, Rd, Address, Decoder)))
- return MCDisassembler::Fail;
- if (!Check(S, DecodeGPRRegisterClass(Inst, Rs2, Address, Decoder)))
- return MCDisassembler::Fail;
- return S;
-}
-
-static DecodeStatus decodeRVCInstrRdRs1Rs2(MCInst &Inst, uint32_t Insn,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- DecodeStatus S = MCDisassembler::Success;
- uint32_t Rd = fieldFromInstruction(Insn, 7, 5);
- uint32_t Rs2 = fieldFromInstruction(Insn, 2, 5);
- if (!Check(S, DecodeGPRRegisterClass(Inst, Rd, Address, Decoder)))
- return MCDisassembler::Fail;
- Inst.addOperand(Inst.getOperand(0));
- if (!Check(S, DecodeGPRRegisterClass(Inst, Rs2, Address, Decoder)))
- return MCDisassembler::Fail;
- return S;
-}
-
static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn,
uint64_t Address,
const MCDisassembler *Decoder) {
@@ -691,24 +602,6 @@ static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn,
return S;
}
-static DecodeStatus decodeZcmpRlist(MCInst &Inst, uint32_t Imm,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- bool IsRVE = Decoder->getSubtargetInfo().hasFeature(RISCV::FeatureStdExtE);
- if (Imm < RISCVZC::RA || (IsRVE && Imm >= RISCVZC::RA_S0_S2))
- return MCDisassembler::Fail;
- Inst.addOperand(MCOperand::createImm(Imm));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm,
- uint64_t Address,
- const MCDisassembler *Decoder) {
- if (Imm < RISCVZC::RA_S0)
- return MCDisassembler::Fail;
- return decodeZcmpRlist(Inst, Imm, Address, Decoder);
-}
-
// Add implied SP operand for C.*SP compressed instructions. The SP operand
// isn't explicitly encoded in the instruction.
void RISCVDisassembler::addSPOperands(MCInst &MI) const {
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index 2c37c3b..82e3b5c 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -320,6 +320,7 @@ bool RISCVAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) {
MCFixup Fixup =
MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_RISCV_ALIGN);
F.setVarFixups({Fixup});
+ F.setLinkerRelaxable();
F.getParent()->setLinkerRelaxable();
return true;
}
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
index 7ad5d5f..bddea43 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -330,7 +330,6 @@ enum OperandType : unsigned {
OPERAND_UIMM32,
OPERAND_UIMM48,
OPERAND_UIMM64,
- OPERAND_ZERO,
OPERAND_THREE,
OPERAND_FOUR,
OPERAND_SIMM5,
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index d4f5d8f..2f32e2a 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -293,7 +293,7 @@ void RISCVAsmPrinter::emitNTLHint(const MachineInstr *MI) {
MCInst Hint;
if (STI->hasStdExtZca())
- Hint.setOpcode(RISCV::C_ADD_HINT);
+ Hint.setOpcode(RISCV::C_ADD);
else
Hint.setOpcode(RISCV::ADD);
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index b1ab76a..9fc0d81 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -1581,7 +1581,8 @@ void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF,
// Set the register and all its subregisters.
if (!MRI.def_empty(CSReg) || MRI.getUsedPhysRegsMask().test(CSReg)) {
SavedRegs.set(CSReg);
- llvm::for_each(SubRegs, [&](unsigned Reg) { return SavedRegs.set(Reg); });
+ for (unsigned Reg : SubRegs)
+ SavedRegs.set(Reg);
}
// Combine to super register if all of its subregisters are marked.
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 54845e5..607edd3 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2739,6 +2739,27 @@ bool RISCVTargetLowering::isLegalElementTypeForRVV(EVT ScalarTy) const {
}
}
+bool RISCVTargetLowering::isLegalLoadStoreElementTypeForRVV(
+ EVT ScalarTy) const {
+ if (!ScalarTy.isSimple())
+ return false;
+ switch (ScalarTy.getSimpleVT().SimpleTy) {
+ case MVT::iPTR:
+ return Subtarget.is64Bit() ? Subtarget.hasVInstructionsI64() : true;
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::f16:
+ case MVT::bf16:
+ case MVT::f32:
+ return true;
+ case MVT::i64:
+ case MVT::f64:
+ return Subtarget.hasVInstructionsI64();
+ default:
+ return false;
+ }
+}
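Note: the new predicate gates only on what can be loaded or stored: 8/16/32-bit integer and FP element types are always accepted, 64-bit types additionally require hasVInstructionsI64(), and pointers follow XLEN. Presumably isLegalElementTypeForRVV is stricter because it also reflects arithmetic support for the element type, which is why the strided, masked, and interleaved memory paths in later hunks are rewired to this looser check.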
unsigned RISCVTargetLowering::combineRepeatedFPDivisors() const {
return NumRepeatedDivisors;
@@ -24239,7 +24260,7 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType,
return false;
EVT ScalarType = DataType.getScalarType();
- if (!isLegalElementTypeForRVV(ScalarType))
+ if (!isLegalLoadStoreElementTypeForRVV(ScalarType))
return false;
if (!Subtarget.enableUnalignedVectorMem() &&
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index ca70c46..a788c0b7 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -384,6 +384,7 @@ public:
bool shouldRemoveExtendFromGSIndex(SDValue Extend, EVT DataVT) const override;
bool isLegalElementTypeForRVV(EVT ScalarTy) const;
+ bool isLegalLoadStoreElementTypeForRVV(EVT ScalarTy) const;
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 64f9e3e..085064e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2859,9 +2859,6 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI,
case RISCVOp::OPERAND_UIMM16_NONZERO:
Ok = isUInt<16>(Imm) && (Imm != 0);
break;
- case RISCVOp::OPERAND_ZERO:
- Ok = Imm == 0;
- break;
case RISCVOp::OPERAND_THREE:
Ok = Imm == 3;
break;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
index 8252a9b..c5551fb 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
@@ -57,12 +57,6 @@ def simm6nonzero : RISCVOp,
}];
}
-def immzero : RISCVOp,
- ImmLeaf<XLenVT, [{return (Imm == 0);}]> {
- let ParserMatchClass = ImmZeroAsmOperand;
- let OperandType = "OPERAND_ZERO";
-}
-
def CLUIImmAsmOperand : AsmOperandClass {
let Name = "CLUIImm";
let RenderMethod = "addImmOperands";
@@ -272,7 +266,7 @@ class Bcz<bits<3> funct3, string OpcodeStr>
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class Shift_right<bits<2> funct2, string OpcodeStr>
: RVInst16CB<0b100, 0b01, (outs GPRC:$rd),
- (ins GPRC:$rs1, uimmlog2xlennonzero:$imm),
+ (ins GPRC:$rs1, uimmlog2xlen:$imm),
OpcodeStr, "$rs1, $imm"> {
let Constraints = "$rs1 = $rd";
let Inst{12} = imm{5};
@@ -402,17 +396,19 @@ def C_NOP : RVInst16CI<0b000, 0b01, (outs), (ins), "c.nop", "">,
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_ADDI : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb),
- (ins GPRNoX0:$rd, simm6nonzero:$imm),
+ (ins GPRNoX0:$rd, simm6:$imm),
"c.addi", "$rd, $imm">,
Sched<[WriteIALU, ReadIALU]> {
let Constraints = "$rd = $rd_wb";
}
-// Alternate syntax for c.nop. Converted to C_NOP by the assembler.
+// Alternate syntax for c.nop. Converted to C_NOP/C_NOP_HINT by the assembler.
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 0,
isAsmParserOnly = 1 in
-def PseudoC_ADDI_NOP : Pseudo<(outs GPRX0:$rd), (ins GPRX0:$rs1, immzero:$imm),
- [], "c.addi", "$rd, $imm">;
+def PseudoC_ADDI_NOP : Pseudo<(outs GPRX0:$rd), (ins GPRX0:$rs1, simm6:$imm),
+ [], "c.addi", "$rd, $imm"> {
+ let Constraints = "$rs1 = $rd";
+}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCall = 1,
DecoderNamespace = "RV32Only", Defs = [X1],
@@ -430,7 +426,7 @@ def C_ADDIW : RVInst16CI<0b001, 0b01, (outs GPRNoX0:$rd_wb),
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-def C_LI : RVInst16CI<0b010, 0b01, (outs GPRNoX0:$rd), (ins simm6:$imm),
+def C_LI : RVInst16CI<0b010, 0b01, (outs GPR:$rd), (ins simm6:$imm),
"c.li", "$rd, $imm">,
Sched<[WriteIALU]>;
@@ -449,7 +445,7 @@ def C_ADDI16SP : RVInst16CI<0b011, 0b01, (outs SP:$rd_wb),
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-def C_LUI : RVInst16CI<0b011, 0b01, (outs GPRNoX0X2:$rd),
+def C_LUI : RVInst16CI<0b011, 0b01, (outs GPRNoX2:$rd),
(ins c_lui_imm:$imm),
"c.lui", "$rd, $imm">,
Sched<[WriteIALU]>;
@@ -497,8 +493,8 @@ def C_BEQZ : Bcz<0b110, "c.beqz">, Sched<[WriteJmp, ReadJmp]>;
def C_BNEZ : Bcz<0b111, "c.bnez">, Sched<[WriteJmp, ReadJmp]>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-def C_SLLI : RVInst16CI<0b000, 0b10, (outs GPRNoX0:$rd_wb),
- (ins GPRNoX0:$rd, uimmlog2xlennonzero:$imm),
+def C_SLLI : RVInst16CI<0b000, 0b10, (outs GPR:$rd_wb),
+ (ins GPR:$rd, uimmlog2xlen:$imm),
"c.slli", "$rd, $imm">,
Sched<[WriteShiftImm, ReadShiftImm]> {
let Constraints = "$rd = $rd_wb";
@@ -544,7 +540,7 @@ def C_JR : RVInst16CR<0b1000, 0b10, (outs), (ins GPRNoX0:$rs1),
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isMoveReg = 1,
isAsCheapAsAMove = 1 in
-def C_MV : RVInst16CR<0b1000, 0b10, (outs GPRNoX0:$rs1), (ins GPRNoX0:$rs2),
+def C_MV : RVInst16CR<0b1000, 0b10, (outs GPR:$rs1), (ins GPRNoX0:$rs2),
"c.mv", "$rs1, $rs2">,
Sched<[WriteIALU, ReadIALU]>;
@@ -557,8 +553,8 @@ def C_JALR : RVInst16CR<0b1001, 0b10, (outs), (ins GPRNoX0:$rs1),
"c.jalr", "$rs1">, Sched<[WriteJalr, ReadJalr]>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-def C_ADD : RVInst16CR<0b1001, 0b10, (outs GPRNoX0:$rd),
- (ins GPRNoX0:$rs1, GPRNoX0:$rs2),
+def C_ADD : RVInst16CR<0b1001, 0b10, (outs GPR:$rd),
+ (ins GPR:$rs1, GPRNoX0:$rs2),
"c.add", "$rs1, $rs2">,
Sched<[WriteIALU, ReadIALU, ReadIALU]> {
let Constraints = "$rs1 = $rd";
@@ -616,81 +612,6 @@ def C_NOP_HINT : RVInst16CI<0b000, 0b01, (outs), (ins simm6nonzero:$imm),
let rd = 0;
}
-def C_ADDI_HINT_IMM_ZERO : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb),
- (ins GPRNoX0:$rd, immzero:$imm),
- "c.addi", "$rd, $imm">,
- Sched<[WriteIALU, ReadIALU]> {
- let Constraints = "$rd = $rd_wb";
- let imm = 0;
- let DecoderMethod = "decodeRVCInstrRdRs1ImmZero";
-}
-
-def C_LI_HINT : RVInst16CI<0b010, 0b01, (outs GPRX0:$rd), (ins simm6:$imm),
- "c.li", "$rd, $imm">,
- Sched<[WriteIALU]> {
- let Inst{11-7} = 0;
- let DecoderMethod = "decodeRVCInstrRdSImm6";
-}
-
-def C_LUI_HINT : RVInst16CI<0b011, 0b01, (outs GPRX0:$rd),
- (ins c_lui_imm:$imm),
- "c.lui", "$rd, $imm">,
- Sched<[WriteIALU]> {
- let Inst{11-7} = 0;
- let DecoderMethod = "decodeRVCInstrRdCLUIImm";
-}
-
-def C_MV_HINT : RVInst16CR<0b1000, 0b10, (outs GPRX0:$rs1), (ins GPRNoX0:$rs2),
- "c.mv", "$rs1, $rs2">, Sched<[WriteIALU, ReadIALU]> {
- let Inst{11-7} = 0;
- let DecoderMethod = "decodeRVCInstrRdRs2";
-}
-
-def C_ADD_HINT : RVInst16CR<0b1001, 0b10, (outs GPRX0:$rd),
- (ins GPRX0:$rs1, GPRNoX0:$rs2),
- "c.add", "$rs1, $rs2">,
- Sched<[WriteIALU, ReadIALU, ReadIALU]> {
- let Constraints = "$rs1 = $rd";
- let Inst{11-7} = 0;
- let DecoderMethod = "decodeRVCInstrRdRs1Rs2";
-}
-
-def C_SLLI_HINT : RVInst16CI<0b000, 0b10, (outs GPRX0:$rd_wb),
- (ins GPRX0:$rd, uimmlog2xlennonzero:$imm),
- "c.slli", "$rd, $imm">,
- Sched<[WriteShiftImm, ReadShiftImm]> {
- let Constraints = "$rd = $rd_wb";
- let Inst{11-7} = 0;
- let DecoderMethod = "decodeRVCInstrRdRs1UImmLog2XLenNonZero";
-}
-
-def C_SLLI64_HINT : RVInst16CI<0b000, 0b10, (outs GPR:$rd_wb), (ins GPR:$rd),
- "c.slli64", "$rd">,
- Sched<[WriteShiftImm, ReadShiftImm]> {
- let Constraints = "$rd = $rd_wb";
- let imm = 0;
-}
-
-def C_SRLI64_HINT : RVInst16CB<0b100, 0b01, (outs GPRC:$rd),
- (ins GPRC:$rs1),
- "c.srli64", "$rs1">,
- Sched<[WriteShiftImm, ReadShiftImm]> {
- let Constraints = "$rs1 = $rd";
- let Inst{6-2} = 0;
- let Inst{11-10} = 0b00;
- let Inst{12} = 0;
-}
-
-def C_SRAI64_HINT : RVInst16CB<0b100, 0b01, (outs GPRC:$rd),
- (ins GPRC:$rs1),
- "c.srai64", "$rs1">,
- Sched<[WriteShiftImm, ReadShiftImm]> {
- let Constraints = "$rs1 = $rd";
- let Inst{6-2} = 0;
- let Inst{11-10} = 0b01;
- let Inst{12} = 0;
-}
-
} // Predicates = [HasStdExtZca], hasSideEffects = 0, mayLoad = 0,
// mayStore = 0
@@ -699,15 +620,17 @@ def C_SRAI64_HINT : RVInst16CB<0b100, 0b01, (outs GPRC:$rd),
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtZca] in {
-// Just a different syntax for the c.nop hint: c.addi x0, simm6 vs c.nop simm6.
-def : InstAlias<"c.addi x0, $imm", (C_NOP_HINT simm6nonzero:$imm), 0>;
+// Legacy aliases.
+def : InstAlias<"c.slli64 $rd", (C_SLLI GPR:$rd, 0), 0>;
+def : InstAlias<"c.srli64 $rs1", (C_SRLI GPRC:$rs1, 0), 0>;
+def : InstAlias<"c.srai64 $rs1", (C_SRAI GPRC:$rs1, 0), 0>;
}
let Predicates = [HasStdExtC, HasStdExtZihintntl] in {
-def : InstAlias<"c.ntl.p1", (C_ADD_HINT X0, X2)>;
-def : InstAlias<"c.ntl.pall", (C_ADD_HINT X0, X3)>;
-def : InstAlias<"c.ntl.s1", (C_ADD_HINT X0, X4)>;
-def : InstAlias<"c.ntl.all", (C_ADD_HINT X0, X5)>;
+def : InstAlias<"c.ntl.p1", (C_ADD X0, X2)>;
+def : InstAlias<"c.ntl.pall", (C_ADD X0, X3)>;
+def : InstAlias<"c.ntl.s1", (C_ADD X0, X4)>;
+def : InstAlias<"c.ntl.all", (C_ADD X0, X5)>;
} // Predicates = [HasStdExtC, HasStdExtZihintntl]
let EmitPriority = 0 in {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index dd365cf..8297d50 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -136,6 +136,7 @@ class RVPUnary_ri<bits<2> w, bits<5> uf, string opcodestr>
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtP] in {
+let IsSignExtendingOpW = 1 in
def CLS : Unary_r<0b011000000011, 0b001, "cls">;
def ABS : Unary_r<0b011000000111, 0b001, "abs">;
} // Predicates = [HasStdExtP]
@@ -146,8 +147,10 @@ let Predicates = [HasStdExtP, IsRV64] in {
def REV16 : Unary_r<0b011010110000, 0b101, "rev16">;
def REV_RV64 : Unary_r<0b011010111111, 0b101, "rev">;
+let IsSignExtendingOpW = 1 in {
def CLSW : UnaryW_r<0b011000000011, 0b001, "clsw">;
def ABSW : UnaryW_r<0b011000000111, 0b001, "absw">;
+}
} // Predicates = [HasStdExtP, IsRV64]
let Predicates = [HasStdExtP] in {
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index 30d8f85..3cbe668 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -32,7 +32,7 @@ bool RISCVTargetLowering::isLegalInterleavedAccessType(
if (!isTypeLegal(VT))
return false;
- if (!isLegalElementTypeForRVV(VT.getScalarType()) ||
+ if (!isLegalLoadStoreElementTypeForRVV(VT.getScalarType()) ||
!allowsMemoryAccessForAlignment(VTy->getContext(), DL, VT, AddrSpace,
Alignment))
return false;
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index e87f452..ccb39e8 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -268,6 +268,11 @@ def GPRNoX0 : GPRRegisterClass<(sub GPR, X0)> {
let DiagnosticString = "register must be a GPR excluding zero (x0)";
}
+def GPRNoX2 : GPRRegisterClass<(sub GPR, X2)> {
+ let DiagnosticType = "InvalidRegClassGPRNoX2";
+ let DiagnosticString = "register must be a GPR excluding sp (x2)";
+}
+
def GPRNoX0X2 : GPRRegisterClass<(sub GPR, X0, X2)> {
let DiagnosticType = "InvalidRegClassGPRNoX0X2";
let DiagnosticString = "register must be a GPR excluding zero (x0) and sp (x2)";
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index d62d99c..f0510ec 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -265,7 +265,7 @@ public:
if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize())
return false;
- return TLI->isLegalElementTypeForRVV(ElemType);
+ return TLI->isLegalLoadStoreElementTypeForRVV(ElemType);
}
bool isLegalMaskedLoad(Type *DataType, Align Alignment,
@@ -297,7 +297,7 @@ public:
if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize())
return false;
- return TLI->isLegalElementTypeForRVV(ElemType);
+ return TLI->isLegalLoadStoreElementTypeForRVV(ElemType);
}
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override {
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp
index 3ef6030..72bb372 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp
@@ -69,8 +69,8 @@ void SystemZHLASMAsmStreamer::EmitEOL() {
void SystemZHLASMAsmStreamer::changeSection(MCSection *Section,
uint32_t Subsection) {
- Section->printSwitchToSection(*MAI, getContext().getTargetTriple(), OS,
- Subsection);
+ MAI->printSwitchToSection(*Section, Subsection,
+ getContext().getTargetTriple(), OS);
MCStreamer::changeSection(Section, Subsection);
}
diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
index 19c9e9c..6ae69a4 100644
--- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
+++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
@@ -900,7 +900,8 @@ public:
bool checkDataSection() {
if (CurrentState != DataSection) {
- auto *WS = cast<MCSectionWasm>(getStreamer().getCurrentSectionOnly());
+ auto *WS = static_cast<const MCSectionWasm *>(
+ getStreamer().getCurrentSectionOnly());
if (WS && WS->isText())
return error("data directive must occur in a data segment: ",
Lexer.getTok());
@@ -1218,7 +1219,8 @@ public:
void doBeforeLabelEmit(MCSymbol *Symbol, SMLoc IDLoc) override {
// Code below only applies to labels in text sections.
- auto *CWS = cast<MCSectionWasm>(getStreamer().getCurrentSectionOnly());
+ auto *CWS = static_cast<const MCSectionWasm *>(
+ getStreamer().getCurrentSectionOnly());
if (!CWS->isText())
return;
diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.td b/llvm/lib/Target/WebAssembly/WebAssembly.td
index 13603f8..a606209 100644
--- a/llvm/lib/Target/WebAssembly/WebAssembly.td
+++ b/llvm/lib/Target/WebAssembly/WebAssembly.td
@@ -71,6 +71,7 @@ def FeatureReferenceTypes :
SubtargetFeature<"reference-types", "HasReferenceTypes", "true",
"Enable reference types">;
+def FeatureGC : SubtargetFeature<"gc", "HasGC", "true", "Enable wasm gc">;
def FeatureRelaxedSIMD :
SubtargetFeature<"relaxed-simd", "SIMDLevel", "RelaxedSIMD",
"Enable relaxed-simd instructions">;
@@ -136,13 +137,13 @@ def : ProcessorModel<"lime1", NoSchedModel,
// Latest and greatest experimental version of WebAssembly. Bugs included!
def : ProcessorModel<"bleeding-edge", NoSchedModel,
- [FeatureAtomics, FeatureBulkMemory, FeatureBulkMemoryOpt,
- FeatureCallIndirectOverlong, FeatureExceptionHandling,
- FeatureExtendedConst, FeatureFP16, FeatureMultiMemory,
- FeatureMultivalue, FeatureMutableGlobals,
- FeatureNontrappingFPToInt, FeatureRelaxedSIMD,
- FeatureReferenceTypes, FeatureSIMD128, FeatureSignExt,
- FeatureTailCall]>;
+ [FeatureAtomics, FeatureBulkMemory, FeatureBulkMemoryOpt,
+ FeatureCallIndirectOverlong, FeatureExceptionHandling,
+ FeatureExtendedConst, FeatureFP16, FeatureMultiMemory,
+ FeatureMultivalue, FeatureMutableGlobals,
+ FeatureNontrappingFPToInt, FeatureRelaxedSIMD,
+ FeatureReferenceTypes, FeatureGC, FeatureSIMD128,
+ FeatureSignExt, FeatureTailCall]>;
//===----------------------------------------------------------------------===//
// Target Declaration
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 11936a3..3f80b2a 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -288,7 +288,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// Expand float operations supported for scalars but not SIMD
for (auto Op : {ISD::FCOPYSIGN, ISD::FLOG, ISD::FLOG2, ISD::FLOG10,
- ISD::FEXP, ISD::FEXP2})
+ ISD::FEXP, ISD::FEXP2, ISD::FEXP10})
for (auto T : {MVT::v4f32, MVT::v2f64})
setOperationAction(Op, T, Expand);
@@ -3436,8 +3436,7 @@ static SDValue performSETCCCombine(SDNode *N,
return SDValue();
}
-static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) {
- assert(N->getOpcode() == ISD::MUL);
+static SDValue TryWideExtMulCombine(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
if (VT != MVT::v8i32 && VT != MVT::v16i32)
return SDValue();
@@ -3523,6 +3522,46 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+static SDValue performMulCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert(N->getOpcode() == ISD::MUL);
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector())
+ return SDValue();
+
+ if (auto Res = TryWideExtMulCombine(N, DCI.DAG))
+ return Res;
+
+  // We don't natively support v16i8 mul, but we do support v8i16, so split
+  // the inputs and extend them to v8i16. Do this only before legalization, in
+  // case a narrow vector is widened and may be simplified later.
+ if (!DCI.isBeforeLegalize() || VT != MVT::v16i8)
+ return SDValue();
+
+ SDLoc DL(N);
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue LowLHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, LHS);
+ SDValue HighLHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, LHS);
+ SDValue LowRHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, RHS);
+ SDValue HighRHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, RHS);
+
+ SDValue MulLow =
+ DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS));
+ SDValue MulHigh = DAG.getBitcast(
+ VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS));
+
+ // Take the low byte of each lane.
+ return DAG.getVectorShuffle(
+ VT, DL, MulLow, MulHigh,
+ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
+}
+
SDValue
WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -3557,6 +3596,6 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
return performLowerPartialReduction(N, DCI.DAG);
}
case ISD::MUL:
- return performMulCombine(N, DCI.DAG);
+ return performMulCombine(N, DCI);
}
}
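A scalar model of the v16i8 path may help (an illustrative sketch, not LLVM code): truncating a byte product to 8 bits is exactly the low byte of the corresponding 16-bit product, which is what the final getVectorShuffle selects from MulLow/MulHigh.

  #include <cstdint>

  // Scalar model: widen each byte lane to 16 bits, multiply, then keep only
  // the low byte of every product (the lanes the shuffle above selects).
  void mulV16I8(const uint8_t A[16], const uint8_t B[16], uint8_t Out[16]) {
    for (int I = 0; I != 16; ++I) {
      uint16_t Wide = uint16_t(A[I]) * uint16_t(B[I]); // the v8i16 MUL
      Out[I] = uint8_t(Wide);                          // low byte per lane
    }
  }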
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index b5e723e..2b632fd 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -76,6 +76,9 @@ def HasReferenceTypes :
Predicate<"Subtarget->hasReferenceTypes()">,
AssemblerPredicate<(all_of FeatureReferenceTypes), "reference-types">;
+def HasGC : Predicate<"Subtarget->hasGC()">,
+ AssemblerPredicate<(all_of FeatureGC), "gc">;
+
def HasRelaxedSIMD :
Predicate<"Subtarget->hasRelaxedSIMD()">,
AssemblerPredicate<(all_of FeatureRelaxedSIMD), "relaxed-simd">;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
index 40b87a0..fc82e5b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
@@ -36,13 +36,10 @@ multiclass REF_I<WebAssemblyRegClass rc, ValueType vt, string ht> {
Requires<[HasReferenceTypes]>;
}
-defm REF_TEST_FUNCREF :
- I<(outs I32: $res),
- (ins TypeIndex:$type, FUNCREF: $ref),
- (outs),
- (ins TypeIndex:$type),
- [],
- "ref.test\t$type, $ref", "ref.test $type", 0xfb14>;
+defm REF_TEST_FUNCREF : I<(outs I32:$res), (ins TypeIndex:$type, FUNCREF:$ref),
+ (outs), (ins TypeIndex:$type), [],
+ "ref.test\t$type, $ref", "ref.test $type", 0xfb14>,
+ Requires<[HasGC]>;
defm "" : REF_I<FUNCREF, funcref, "func">;
defm "" : REF_I<EXTERNREF, externref, "extern">;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index d13862f..143298b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1540,6 +1540,8 @@ multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate>
def : Pat<(fadd_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))),
(!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
+ def : Pat<(fsub_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))),
+ (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
}
defm "" : SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index 28f6599..c3990d1 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -782,6 +782,24 @@ void WebAssemblyLowerEmscriptenEHSjLj::rebuildSSA(Function &F) {
for (Instruction &I : BB) {
if (I.getType()->isVoidTy())
continue;
+
+ if (isa<AllocaInst>(&I)) {
+      // If the alloca has any lifetime marker that is no longer dominated
+      // by the alloca, remove all lifetime markers: lifetime markers must
+      // apply directly to the alloca, and that is no longer possible here.
+ bool HasNonDominatedLifetimeMarker = any_of(I.users(), [&](User *U) {
+ auto *UserI = cast<Instruction>(U);
+ return UserI->isLifetimeStartOrEnd() && !DT.dominates(&I, UserI);
+ });
+ if (HasNonDominatedLifetimeMarker) {
+ for (User *U : make_early_inc_range(I.users())) {
+ auto *UserI = cast<Instruction>(U);
+ if (UserI->isLifetimeStartOrEnd())
+ UserI->eraseFromParent();
+ }
+ }
+ }
+
unsigned VarID = SSA.AddVariable(I.getName(), I.getType());
// If a value is defined by an invoke instruction, it is only available in
// its normal destination and not in its unwind destination.
@@ -1269,10 +1287,20 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
// Setjmp preparation
+ SmallVector<AllocaInst *> StaticAllocas;
+ for (Instruction &I : F.getEntryBlock())
+ if (auto *AI = dyn_cast<AllocaInst>(&I))
+ if (AI->isStaticAlloca())
+ StaticAllocas.push_back(AI);
+
BasicBlock *Entry = &F.getEntryBlock();
DebugLoc FirstDL = getOrCreateDebugLoc(&*Entry->begin(), F.getSubprogram());
SplitBlock(Entry, &*Entry->getFirstInsertionPt());
+ // Move static allocas back into the entry block, so they stay static.
+ for (AllocaInst *AI : StaticAllocas)
+ AI->moveBefore(Entry->getTerminator()->getIterator());
+
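Context for the two hunks above: SplitBlock moves everything from the split point onward into the new block, so the original allocas would no longer sit in the entry block and would stop satisfying AllocaInst::isStaticAlloca(); moving them back ahead of the entry terminator keeps them static, and therefore part of the fixed stack frame, while the setjmp bookkeeping is inserted after them.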
IRB.SetInsertPoint(Entry->getTerminator()->getIterator());
// This alloca'ed pointer is used by the runtime to identify function
// invocations. It's just for pointer comparisons. It will never be
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
index 7912aeb..ffd135d 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
@@ -63,8 +63,10 @@ void OptimizeReturned::visitCallBase(CallBase &CB) {
if (isa<Constant>(Arg))
continue;
// Like replaceDominatedUsesWith but using Instruction/Use dominance.
- Arg->replaceUsesWithIf(&CB,
- [&](Use &U) { return DT->dominates(&CB, U); });
+ Arg->replaceUsesWithIf(&CB, [&](Use &U) {
+ auto *I = cast<Instruction>(U.getUser());
+ return !I->isLifetimeStartOrEnd() && DT->dominates(&CB, U);
+ });
}
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
index 40ea48a..a3ce40f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
@@ -43,6 +43,11 @@ WebAssemblySubtarget::initializeSubtargetDependencies(StringRef CPU,
Bits.set(WebAssembly::FeatureBulkMemoryOpt);
}
+ // gc implies reference-types
+ if (HasGC) {
+ HasReferenceTypes = true;
+ }
+
// reference-types implies call-indirect-overlong
if (HasReferenceTypes) {
HasCallIndirectOverlong = true;
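The implications therefore chain: gc enables reference-types, which enables call-indirect-overlong. In practice (an illustrative expectation, not a test carried by this patch) building with -mattr=+gc alone should satisfy the HasGC assembler predicate that now guards ref.test in WebAssemblyInstrRef.td.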
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
index 591ce256..f814274 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
@@ -51,6 +51,7 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo {
bool HasMutableGlobals = false;
bool HasNontrappingFPToInt = false;
bool HasReferenceTypes = false;
+ bool HasGC = false;
bool HasSignExt = false;
bool HasTailCall = false;
bool HasWideArithmetic = false;
@@ -107,6 +108,7 @@ public:
bool hasMutableGlobals() const { return HasMutableGlobals; }
bool hasNontrappingFPToInt() const { return HasNontrappingFPToInt; }
bool hasReferenceTypes() const { return HasReferenceTypes; }
+ bool hasGC() const { return HasGC; }
bool hasRelaxedSIMD() const { return SIMDLevel >= RelaxedSIMD; }
bool hasSignExt() const { return HasSignExt; }
bool hasSIMD128() const { return SIMDLevel >= SIMD128; }
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 8213e51..d7671ed 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -4803,7 +4803,7 @@ bool X86AsmParser::parseDirectiveEven(SMLoc L) {
getStreamer().initSections(false, getSTI());
Section = getStreamer().getCurrentSectionOnly();
}
- if (Section->useCodeAlign())
+ if (getContext().getAsmInfo()->useCodeAlign(*Section))
getStreamer().emitCodeAlignment(Align(2), &getSTI(), 0);
else
getStreamer().emitValueToAlignment(Align(2), 0, 1, 0);
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index e213923..7f9d474 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -388,36 +388,6 @@ static bool mayHaveInterruptDelaySlot(unsigned InstOpcode) {
return false;
}
-/// Check if the instruction to be emitted is right after any data.
-static bool
-isRightAfterData(MCFragment *CurrentFragment,
- const std::pair<MCFragment *, size_t> &PrevInstPosition) {
- MCFragment *F = CurrentFragment;
- // Since data is always emitted into a DataFragment, our check strategy is
- // simple here.
- // - If the fragment is a DataFragment
- // - If it's empty (section start or data after align), return false.
- // - If it's not the fragment where the previous instruction is,
- // returns true.
- // - If it's the fragment holding the previous instruction but its
- // size changed since the previous instruction was emitted into
- // it, returns true.
- // - Otherwise returns false.
- // - If the fragment is not a DataFragment, returns false.
- if (F->getKind() == MCFragment::FT_Data)
- return F->getFixedSize() && (F != PrevInstPosition.first ||
- F->getFixedSize() != PrevInstPosition.second);
-
- return false;
-}
-
-/// \returns the fragment size if it has instructions, otherwise returns 0.
-static size_t getSizeForInstFragment(const MCFragment *F) {
- if (!F || !F->hasInstructions())
- return 0;
- return F->getSize();
-}
-
/// Return true if we can insert NOP or prefixes automatically before the
/// the instruction to be emitted.
bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const {
@@ -441,9 +411,11 @@ bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const {
// semantic.
return false;
- if (isRightAfterData(OS.getCurrentFragment(), PrevInstPosition))
- // If this instruction follows any data, there is no clear
- // instruction boundary, inserting a nop/prefix would change semantic.
+  // If this instruction follows any data, there is no clear instruction
+  // boundary, and inserting a nop/prefix would change the semantics.
+ auto Offset = OS.getCurFragSize();
+ if (Offset && (OS.getCurrentFragment() != PrevInstPosition.first ||
+ Offset != PrevInstPosition.second))
return false;
return true;
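This inlines the deleted isRightAfterData()/getSizeForInstFragment() helpers: with Offset holding the current fragment's size, padding is refused whenever the fragment already contains bytes and is not at the exact fragment-and-offset position recorded after the previous instruction, i.e. whenever raw data was emitted in between.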
@@ -552,7 +524,7 @@ void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS,
// Update PrevInstOpcode here, canPadInst() reads that.
MCFragment *CF = OS.getCurrentFragment();
PrevInstOpcode = Inst.getOpcode();
- PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF));
+ PrevInstPosition = std::make_pair(CF, OS.getCurFragSize());
if (!canPadBranches(OS))
return;
diff --git a/llvm/lib/Target/X86/X86PassRegistry.def b/llvm/lib/Target/X86/X86PassRegistry.def
index 620526ff..3f2a433 100644
--- a/llvm/lib/Target/X86/X86PassRegistry.def
+++ b/llvm/lib/Target/X86/X86PassRegistry.def
@@ -12,8 +12,52 @@
// NOTE: NO INCLUDE GUARD DESIRED!
+#ifndef DUMMY_FUNCTION_PASS
+#define DUMMY_FUNCTION_PASS(NAME, CREATE_PASS)
+#endif
+DUMMY_FUNCTION_PASS("lower-amx-intrinsics", X86LowerAMXIntrinsics(*this))
+DUMMY_FUNCTION_PASS("lower-amx-type", X86LowerAMXTypePass(*this))
+DUMMY_FUNCTION_PASS("x86-partial-reduction", X86PartialReduction())
+DUMMY_FUNCTION_PASS("x86-winehstate", WinEHStatePass())
+#undef DUMMY_FUNCTION_PASS
+
#ifndef MACHINE_FUNCTION_PASS
#define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS)
#endif
MACHINE_FUNCTION_PASS("x86-isel", X86ISelDAGToDAGPass(*this))
#undef MACHINE_FUNCTION_PASS
+
+#ifndef DUMMY_MACHINE_FUNCTION_PASS
+#define DUMMY_MACHINE_FUNCTION_PASS(NAME, PASS_NAME)
+#endif
+DUMMY_MACHINE_FUNCTION_PASS("x86-avoid-SFB", X86AvoidSFBPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-avoid-trailing-call", X86AvoidTrailingCallPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-cf-opt", X86CallFrameOptimization())
+DUMMY_MACHINE_FUNCTION_PASS("x86-cmov-conversion", X86CmovConverterPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-codege", FPS())
+DUMMY_MACHINE_FUNCTION_PASS("x86-compress-evex", CompressEVEXPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-domain-reassignment", X86DomainReassignment())
+DUMMY_MACHINE_FUNCTION_PASS("x86-dyn-alloca-expander", X86DynAllocaExpander())
+DUMMY_MACHINE_FUNCTION_PASS("x86-execution-domain-fix", X86ExecutionDomainFix())
+DUMMY_MACHINE_FUNCTION_PASS("fastpretileconfig", X86FastPreTileConfig())
+DUMMY_MACHINE_FUNCTION_PASS("fasttileconfig", X86FastTileConfig())
+DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-LEAs", FixupLEAPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-bw-inst", FixupBWInstPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-inst-tuning", X86FixupInstTuningPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-setcc", X86FixupSetCCPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-fixup-vector-constants", X86FixupVectorConstantsPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-flags-copy-lowering", X86FlagsCopyLoweringPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-lower-tile-copy", X86LowerTileCopy())
+DUMMY_MACHINE_FUNCTION_PASS("x86-lvi-load", X86LoadValueInjectionLoadHardeningPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-lvi-ret", X86LoadValueInjectionRetHardeningPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-optimize-LEAs", X86OptimizeLEAPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-pseudo", X86ExpandPseudo())
+DUMMY_MACHINE_FUNCTION_PASS("x86-return-thunks", X86ReturnThunks())
+DUMMY_MACHINE_FUNCTION_PASS("x86-seses", X86SpeculativeExecutionSideEffectSuppression())
+DUMMY_MACHINE_FUNCTION_PASS("x86-slh", X86SpeculativeLoadHardeningPass())
+DUMMY_MACHINE_FUNCTION_PASS("x86-suppress-apx-for-relocation", X86SuppressAPXForRelocationPass())
+DUMMY_MACHINE_FUNCTION_PASS("tile-pre-config", X86PreTileConfig())
+DUMMY_MACHINE_FUNCTION_PASS("tileconfig", X86TileConfig())
+DUMMY_MACHINE_FUNCTION_PASS("x86-wineh-unwindv2", X86WinEHUnwindV2())
+DUMMY_MACHINE_FUNCTION_PASS("x86argumentstackrebase", X86ArgumentStackSlotPass())
+#undef DUMMY_MACHINE_FUNCTION_PASS
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 37a7b37..90791fc 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1838,14 +1838,15 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return LT.first * *KindCost;
static const CostKindTblEntry AVX512BWShuffleTbl[] = {
- { TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
- { TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
- { TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
+ { TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
+ { TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
+ { TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
- { TTI::SK_Reverse, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw
- { TTI::SK_Reverse, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw
+ { TTI::SK_Reverse, MVT::v32i16, { 2, 6, 2, 4 } }, // vpermw
+ { TTI::SK_Reverse, MVT::v32f16, { 2, 6, 2, 4 } }, // vpermw
{ TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
- { TTI::SK_Reverse, MVT::v64i8, { 2, 2, 2, 2 } }, // pshufb + vshufi64x2
+ { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
+ { TTI::SK_Reverse, MVT::v64i8, { 2, 9, 2, 3 } }, // pshufb + vshufi64x2
{ TTI::SK_PermuteSingleSrc, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw
{ TTI::SK_PermuteSingleSrc, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw
@@ -1874,18 +1875,25 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return LT.first * *KindCost;
static const CostKindTblEntry AVX512ShuffleTbl[] = {
- {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd
- {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
- {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq
- {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
- {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw
- {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
- {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
-
- {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
- {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
- {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
- {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
+ {TTI::SK_Broadcast, MVT::v8f64, { 1, 3, 1, 1 } }, // vbroadcastsd
+ {TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 1 } }, // vbroadcastsd
+ {TTI::SK_Broadcast, MVT::v16f32, { 1, 3, 1, 1 } }, // vbroadcastss
+ {TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 1 } }, // vbroadcastss
+ {TTI::SK_Broadcast, MVT::v8i64, { 1, 3, 1, 1 } }, // vpbroadcastq
+ {TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 1 } }, // vpbroadcastq
+ {TTI::SK_Broadcast, MVT::v16i32, { 1, 3, 1, 1 } }, // vpbroadcastd
+ {TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 1 } }, // vpbroadcastd
+ {TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 1 } }, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 1 } }, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
+   {TTI::SK_Broadcast, MVT::v32i8,   { 1, 3, 1, 1 } }, // vpbroadcastb
+
+ {TTI::SK_Reverse, MVT::v8f64, { 1, 5, 2, 3 } }, // vpermpd
+ {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 2, 3 } }, // vpermps
+ {TTI::SK_Reverse, MVT::v8i64, { 1, 5, 2, 3 } }, // vpermq
+ {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 2, 3 } }, // vpermd
{TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
{TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
{TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
@@ -1973,21 +1981,24 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return LT.first * *KindCost;
static const CostKindTblEntry AVX2ShuffleTbl[] = {
- { TTI::SK_Broadcast, MVT::v4f64, { 1, 1, 1, 1 } }, // vbroadcastpd
- { TTI::SK_Broadcast, MVT::v8f32, { 1, 1, 1, 1 } }, // vbroadcastps
- { TTI::SK_Broadcast, MVT::v4i64, { 1, 1, 1, 1 } }, // vpbroadcastq
- { TTI::SK_Broadcast, MVT::v8i32, { 1, 1, 1, 1 } }, // vpbroadcastd
- { TTI::SK_Broadcast, MVT::v16i16, { 1, 1, 1, 1 } }, // vpbroadcastw
- { TTI::SK_Broadcast, MVT::v16f16, { 1, 1, 1, 1 } }, // vpbroadcastw
- { TTI::SK_Broadcast, MVT::v32i8, { 1, 1, 1, 1 } }, // vpbroadcastb
-
- { TTI::SK_Reverse, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermpd
- { TTI::SK_Reverse, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermps
- { TTI::SK_Reverse, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermq
- { TTI::SK_Reverse, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermd
- { TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vperm2i128 + pshufb
- { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vperm2i128 + pshufb
- { TTI::SK_Reverse, MVT::v32i8, { 2, 2, 2, 2 } }, // vperm2i128 + pshufb
+ { TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 2 } }, // vbroadcastpd
+ { TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 2 } }, // vbroadcastps
+ { TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 2 } }, // vpbroadcastq
+ { TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 2 } }, // vpbroadcastd
+ { TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 2 } }, // vpbroadcastw
+ { TTI::SK_Broadcast, MVT::v8i16, { 1, 3, 1, 1 } }, // vpbroadcastw
+ { TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 2 } }, // vpbroadcastw
+ { TTI::SK_Broadcast, MVT::v8f16, { 1, 3, 1, 1 } }, // vpbroadcastw
+ { TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 2 } }, // vpbroadcastb
+ { TTI::SK_Broadcast, MVT::v16i8, { 1, 3, 1, 1 } }, // vpbroadcastb
+
+ { TTI::SK_Reverse, MVT::v4f64, { 1, 6, 1, 2 } }, // vpermpd
+ { TTI::SK_Reverse, MVT::v8f32, { 2, 7, 2, 4 } }, // vpermps
+ { TTI::SK_Reverse, MVT::v4i64, { 1, 6, 1, 2 } }, // vpermq
+ { TTI::SK_Reverse, MVT::v8i32, { 2, 7, 2, 4 } }, // vpermd
+ { TTI::SK_Reverse, MVT::v16i16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
+ { TTI::SK_Reverse, MVT::v16f16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
+ { TTI::SK_Reverse, MVT::v32i8, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
{ TTI::SK_Select, MVT::v16i16, { 1, 1, 1, 1 } }, // vpblendvb
{ TTI::SK_Select, MVT::v16f16, { 1, 1, 1, 1 } }, // vpblendvb
@@ -2077,23 +2088,23 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return LT.first * *KindCost;
static const CostKindTblEntry AVX1ShuffleTbl[] = {
- {TTI::SK_Broadcast, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vpermilpd
- {TTI::SK_Broadcast, MVT::v8f32, {2,2,2,2}}, // vperm2f128 + vpermilps
- {TTI::SK_Broadcast, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vpermilpd
- {TTI::SK_Broadcast, MVT::v8i32, {2,2,2,2}}, // vperm2f128 + vpermilps
- {TTI::SK_Broadcast, MVT::v16i16, {3,3,3,3}}, // vpshuflw + vpshufd + vinsertf128
- {TTI::SK_Broadcast, MVT::v16f16, {3,3,3,3}}, // vpshuflw + vpshufd + vinsertf128
- {TTI::SK_Broadcast, MVT::v32i8, {2,2,2,2}}, // vpshufb + vinsertf128
-
- {TTI::SK_Reverse, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vpermilpd
- {TTI::SK_Reverse, MVT::v8f32, {2,2,2,2}}, // vperm2f128 + vpermilps
- {TTI::SK_Reverse, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vpermilpd
- {TTI::SK_Reverse, MVT::v8i32, {2,2,2,2}}, // vperm2f128 + vpermilps
- {TTI::SK_Reverse, MVT::v16i16, {4,4,4,4}}, // vextractf128 + 2*pshufb
+ {TTI::SK_Broadcast, MVT::v4f64, {2,3,2,3}}, // vperm2f128 + vpermilpd
+ {TTI::SK_Broadcast, MVT::v8f32, {2,3,2,3}}, // vperm2f128 + vpermilps
+ {TTI::SK_Broadcast, MVT::v4i64, {2,3,2,3}}, // vperm2f128 + vpermilpd
+ {TTI::SK_Broadcast, MVT::v8i32, {2,3,2,3}}, // vperm2f128 + vpermilps
+ {TTI::SK_Broadcast, MVT::v16i16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
+ {TTI::SK_Broadcast, MVT::v16f16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
+ {TTI::SK_Broadcast, MVT::v32i8, {3,4,3,6}}, // vpshufb + vinsertf128
+
+ {TTI::SK_Reverse, MVT::v4f64, {2,6,2,2}}, // vperm2f128 + vpermilpd
+ {TTI::SK_Reverse, MVT::v8f32, {2,7,2,4}}, // vperm2f128 + vpermilps
+ {TTI::SK_Reverse, MVT::v4i64, {2,6,2,2}}, // vperm2f128 + vpermilpd
+ {TTI::SK_Reverse, MVT::v8i32, {2,7,2,4}}, // vperm2f128 + vpermilps
+ {TTI::SK_Reverse, MVT::v16i16, {2,9,5,5}}, // vextractf128 + 2*pshufb
// + vinsertf128
- {TTI::SK_Reverse, MVT::v16f16, {4,4,4,4}}, // vextractf128 + 2*pshufb
+ {TTI::SK_Reverse, MVT::v16f16, {2,9,5,5}}, // vextractf128 + 2*pshufb
// + vinsertf128
- {TTI::SK_Reverse, MVT::v32i8, {4,4,4,4}}, // vextractf128 + 2*pshufb
+ {TTI::SK_Reverse, MVT::v32i8, {2,9,5,5}}, // vextractf128 + 2*pshufb
// + vinsertf128
{TTI::SK_Select, MVT::v4i64, {1,1,1,1}}, // vblendpd
@@ -2156,13 +2167,13 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return LT.first * *KindCost;
static const CostKindTblEntry SSSE3ShuffleTbl[] = {
- {TTI::SK_Broadcast, MVT::v8i16, {1, 1, 1, 1}}, // pshufb
- {TTI::SK_Broadcast, MVT::v8f16, {1, 1, 1, 1}}, // pshufb
- {TTI::SK_Broadcast, MVT::v16i8, {1, 1, 1, 1}}, // pshufb
+ {TTI::SK_Broadcast, MVT::v8i16, {1, 3, 2, 2}}, // pshufb
+ {TTI::SK_Broadcast, MVT::v8f16, {1, 3, 2, 2}}, // pshufb
+ {TTI::SK_Broadcast, MVT::v16i8, {1, 3, 2, 2}}, // pshufb
- {TTI::SK_Reverse, MVT::v8i16, {1, 1, 1, 1}}, // pshufb
- {TTI::SK_Reverse, MVT::v8f16, {1, 1, 1, 1}}, // pshufb
- {TTI::SK_Reverse, MVT::v16i8, {1, 1, 1, 1}}, // pshufb
+ {TTI::SK_Reverse, MVT::v8i16, {1, 2, 1, 2}}, // pshufb
+ {TTI::SK_Reverse, MVT::v8f16, {1, 2, 1, 2}}, // pshufb
+ {TTI::SK_Reverse, MVT::v16i8, {1, 2, 1, 2}}, // pshufb
{TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
{TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
@@ -2192,16 +2203,16 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Broadcast, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
{TTI::SK_Broadcast, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
{TTI::SK_Broadcast, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
- {TTI::SK_Broadcast, MVT::v8i16, {2, 2, 2, 2}}, // pshuflw + pshufd
- {TTI::SK_Broadcast, MVT::v8f16, {2, 2, 2, 2}}, // pshuflw + pshufd
- {TTI::SK_Broadcast, MVT::v16i8, {3, 3, 3, 3}}, // unpck + pshuflw + pshufd
+ {TTI::SK_Broadcast, MVT::v8i16, {1, 2, 2, 2}}, // pshuflw + pshufd
+ {TTI::SK_Broadcast, MVT::v8f16, {1, 2, 2, 2}}, // pshuflw + pshufd
+ {TTI::SK_Broadcast, MVT::v16i8, {2, 3, 3, 4}}, // unpck + pshuflw + pshufd
{TTI::SK_Reverse, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
{TTI::SK_Reverse, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
{TTI::SK_Reverse, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
- {TTI::SK_Reverse, MVT::v8i16, {3, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
- {TTI::SK_Reverse, MVT::v8f16, {3, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
- {TTI::SK_Reverse, MVT::v16i8, {9, 9, 9, 9}}, // 2*pshuflw + 2*pshufhw
+ {TTI::SK_Reverse, MVT::v8i16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
+ {TTI::SK_Reverse, MVT::v8f16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
+ {TTI::SK_Reverse, MVT::v16i8, {5, 6,11,11}}, // 2*pshuflw + 2*pshufhw
// + 2*pshufd + 2*unpck + packus
{TTI::SK_Select, MVT::v2i64, {1, 1, 1, 1}}, // movsd
diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp
index 17c9833..d6afb8a 100644
--- a/llvm/lib/TargetParser/RISCVISAInfo.cpp
+++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp
@@ -858,16 +858,15 @@ void RISCVISAInfo::updateImplication() {
StringRef ExtName = WorkList.pop_back_val();
auto Range = std::equal_range(std::begin(ImpliedExts),
std::end(ImpliedExts), ExtName);
- std::for_each(Range.first, Range.second,
- [&](const ImpliedExtsEntry &Implied) {
- const char *ImpliedExt = Implied.ImpliedExt;
- auto [It, Inserted] = Exts.try_emplace(ImpliedExt);
- if (!Inserted)
- return;
- auto Version = findDefaultVersion(ImpliedExt);
- It->second = *Version;
- WorkList.push_back(ImpliedExt);
- });
+ for (const ImpliedExtsEntry &Implied : llvm::make_range(Range)) {
+ const char *ImpliedExt = Implied.ImpliedExt;
+ auto [It, Inserted] = Exts.try_emplace(ImpliedExt);
+ if (!Inserted)
+ continue;
+ auto Version = findDefaultVersion(ImpliedExt);
+ It->second = *Version;
+ WorkList.push_back(ImpliedExt);
+ }
}
// Add Zcd if C and D are enabled.
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index ee6651c..6acb0bc 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -277,6 +277,8 @@ StringRef Triple::getVendorTypeName(VendorType Kind) {
case PC: return "pc";
case SCEI: return "scei";
case SUSE: return "suse";
+ case Meta:
+ return "meta";
}
llvm_unreachable("Invalid VendorType!");
@@ -390,6 +392,8 @@ StringRef Triple::getEnvironmentTypeName(EnvironmentType Kind) {
case OpenHOS: return "ohos";
case PAuthTest:
return "pauthtest";
+ case MTIA:
+ return "mtia";
case LLVM:
return "llvm";
case Mlibc:
@@ -677,6 +681,7 @@ static Triple::VendorType parseVendor(StringRef VendorName) {
.Case("suse", Triple::SUSE)
.Case("oe", Triple::OpenEmbedded)
.Case("intel", Triple::Intel)
+ .Case("meta", Triple::Meta)
.Default(Triple::UnknownVendor);
}
@@ -780,6 +785,7 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) {
.StartsWith("pauthtest", Triple::PAuthTest)
.StartsWith("llvm", Triple::LLVM)
.StartsWith("mlibc", Triple::Mlibc)
+ .StartsWith("mtia", Triple::MTIA)
.Default(Triple::UnknownEnvironment);
}
diff --git a/llvm/lib/TextAPI/SymbolSet.cpp b/llvm/lib/TextAPI/SymbolSet.cpp
index 2e0b416..f21a061 100644
--- a/llvm/lib/TextAPI/SymbolSet.cpp
+++ b/llvm/lib/TextAPI/SymbolSet.cpp
@@ -11,6 +11,11 @@
using namespace llvm;
using namespace llvm::MachO;
+SymbolSet::~SymbolSet() {
+ for (auto &[Key, Sym] : Symbols)
+ Sym->~Symbol();
+}
+
Symbol *SymbolSet::addGlobalImpl(EncodeKind Kind, StringRef Name,
SymbolFlags Flags) {
Name = copyString(Name);
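A plausible reading of the new destructor (an assumption about SymbolSet's allocation scheme, which this hunk does not show): Symbol objects are placement-new'ed into an arena, so their destructors must be run by hand while the arena reclaims the raw storage. The general pattern, with a stand-in Widget type:

  #include "llvm/Support/Allocator.h"
  #include <string>

  struct Widget { std::string Name; }; // non-trivial member, real destructor

  void arenaPattern() {
    llvm::BumpPtrAllocator Arena;
    void *Mem = Arena.Allocate(sizeof(Widget), alignof(Widget));
    auto *W = new (Mem) Widget{"example"}; // placement new into the arena
    W->~Widget(); // run the destructor explicitly; the arena never will
  } // Arena's own destructor releases the raw memory here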
diff --git a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp
index b3910c4..d895cd7 100644
--- a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp
+++ b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp
@@ -37,6 +37,16 @@
// memory that ends up in one of the runtime equivalents, since this can
// happen if e.g. a library that was compiled without interposition returns
// an allocation that can be validly passed to `free`.
+//
+// 3. MathFixup (required): Some accelerators might have an incomplete
+// implementation for the intrinsics used to implement some of the math
+// functions in <cmath> / their corresponding libcall lowerings. Since this
+// can vary quite significantly between accelerators, we replace calls to a
+// set of intrinsics / lib functions known to be problematic with calls to a
+// HIPSTDPAR specific forwarding layer, which gives a uniform interface for
+// accelerators to implement in their own runtime components. This pass
+// should run before AcceleratorCodeSelection so as to prevent the spurious
+// removal of the HIPSTDPAR specific forwarding functions.
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/HipStdPar/HipStdPar.h"
@@ -49,6 +59,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -519,3 +530,110 @@ HipStdParAllocationInterpositionPass::run(Module &M, ModuleAnalysisManager&) {
return PreservedAnalyses::none();
}
+
+static constexpr std::pair<StringLiteral, StringLiteral> MathLibToHipStdPar[]{
+ {"acosh", "__hipstdpar_acosh_f64"},
+ {"acoshf", "__hipstdpar_acosh_f32"},
+ {"asinh", "__hipstdpar_asinh_f64"},
+ {"asinhf", "__hipstdpar_asinh_f32"},
+ {"atanh", "__hipstdpar_atanh_f64"},
+ {"atanhf", "__hipstdpar_atanh_f32"},
+ {"cbrt", "__hipstdpar_cbrt_f64"},
+ {"cbrtf", "__hipstdpar_cbrt_f32"},
+ {"erf", "__hipstdpar_erf_f64"},
+ {"erff", "__hipstdpar_erf_f32"},
+ {"erfc", "__hipstdpar_erfc_f64"},
+ {"erfcf", "__hipstdpar_erfc_f32"},
+ {"fdim", "__hipstdpar_fdim_f64"},
+ {"fdimf", "__hipstdpar_fdim_f32"},
+ {"expm1", "__hipstdpar_expm1_f64"},
+ {"expm1f", "__hipstdpar_expm1_f32"},
+ {"hypot", "__hipstdpar_hypot_f64"},
+ {"hypotf", "__hipstdpar_hypot_f32"},
+ {"ilogb", "__hipstdpar_ilogb_f64"},
+ {"ilogbf", "__hipstdpar_ilogb_f32"},
+ {"lgamma", "__hipstdpar_lgamma_f64"},
+ {"lgammaf", "__hipstdpar_lgamma_f32"},
+ {"log1p", "__hipstdpar_log1p_f64"},
+ {"log1pf", "__hipstdpar_log1p_f32"},
+ {"logb", "__hipstdpar_logb_f64"},
+ {"logbf", "__hipstdpar_logb_f32"},
+ {"nextafter", "__hipstdpar_nextafter_f64"},
+ {"nextafterf", "__hipstdpar_nextafter_f32"},
+ {"nexttoward", "__hipstdpar_nexttoward_f64"},
+ {"nexttowardf", "__hipstdpar_nexttoward_f32"},
+ {"remainder", "__hipstdpar_remainder_f64"},
+ {"remainderf", "__hipstdpar_remainder_f32"},
+ {"remquo", "__hipstdpar_remquo_f64"},
+ {"remquof", "__hipstdpar_remquo_f32"},
+ {"scalbln", "__hipstdpar_scalbln_f64"},
+ {"scalblnf", "__hipstdpar_scalbln_f32"},
+ {"scalbn", "__hipstdpar_scalbn_f64"},
+ {"scalbnf", "__hipstdpar_scalbn_f32"},
+ {"tgamma", "__hipstdpar_tgamma_f64"},
+ {"tgammaf", "__hipstdpar_tgamma_f32"}};
+
+PreservedAnalyses HipStdParMathFixupPass::run(Module &M,
+ ModuleAnalysisManager &) {
+ if (M.empty())
+ return PreservedAnalyses::all();
+
+ SmallVector<std::pair<Function *, std::string>> ToReplace;
+ for (auto &&F : M) {
+ if (!F.hasName())
+ continue;
+
+ StringRef N = F.getName();
+ Intrinsic::ID ID = F.getIntrinsicID();
+
+ switch (ID) {
+ case Intrinsic::not_intrinsic: {
+ auto It =
+ find_if(MathLibToHipStdPar, [&](auto &&M) { return M.first == N; });
+ if (It == std::cend(MathLibToHipStdPar))
+ continue;
+ ToReplace.emplace_back(&F, It->second);
+ break;
+ }
+ case Intrinsic::acos:
+ case Intrinsic::asin:
+ case Intrinsic::atan:
+ case Intrinsic::atan2:
+ case Intrinsic::cosh:
+ case Intrinsic::modf:
+ case Intrinsic::sinh:
+ case Intrinsic::tan:
+ case Intrinsic::tanh:
+ break;
+ default: {
+ if (F.getReturnType()->isDoubleTy()) {
+ switch (ID) {
+ case Intrinsic::cos:
+ case Intrinsic::exp:
+ case Intrinsic::exp2:
+ case Intrinsic::log:
+ case Intrinsic::log10:
+ case Intrinsic::log2:
+ case Intrinsic::pow:
+ case Intrinsic::sin:
+ break;
+ default:
+ continue;
+ }
+ break;
+ }
+ continue;
+ }
+ }
+
+ ToReplace.emplace_back(&F, N);
+ llvm::replace(ToReplace.back().second, '.', '_');
+ StringRef Prefix = "llvm";
+ ToReplace.back().second.replace(0, Prefix.size(), "__hipstdpar");
+ }
+ for (auto &&[F, NewF] : ToReplace)
+ F->replaceAllUsesWith(
+ M.getOrInsertFunction(NewF, F->getFunctionType()).getCallee());
+
+ return PreservedAnalyses::none();
+}
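Editorial aside: a standalone sketch of the intrinsic renaming performed above (the helper name is illustrative; the pass applies the same steps to the strings collected in ToReplace):

#include <algorithm>
#include <string>

// "llvm.cos.f64" -> "llvm_cos_f64" -> "__hipstdpar_cos_f64", mirroring the
// replace('.', '_') and prefix swap in the loop above.
std::string toHipStdParName(std::string Name) {
  std::replace(Name.begin(), Name.end(), '.', '_');
  return Name.replace(0, 4, "__hipstdpar"); // 4 == strlen("llvm")
}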
diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index f43202e..8262c8c 100644
--- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -1863,7 +1863,6 @@ void AttributeInferer::run(const SCCNodeSet &SCCNodes,
struct SCCNodesResult {
SCCNodeSet SCCNodes;
- bool HasUnknownCall;
};
} // end anonymous namespace
@@ -2227,29 +2226,13 @@ static void addWillReturn(const SCCNodeSet &SCCNodes,
static SCCNodesResult createSCCNodeSet(ArrayRef<Function *> Functions) {
SCCNodesResult Res;
- Res.HasUnknownCall = false;
for (Function *F : Functions) {
if (!F || F->hasOptNone() || F->hasFnAttribute(Attribute::Naked) ||
F->isPresplitCoroutine()) {
- // Treat any function we're trying not to optimize as if it were an
- // indirect call and omit it from the node set used below.
- Res.HasUnknownCall = true;
+ // Omit any functions we're trying not to optimize from the set.
continue;
}
- // Track whether any functions in this SCC have an unknown call edge.
- // Note: if this is ever a performance hit, we can common it with
- // subsequent routines which also do scans over the instructions of the
- // function.
- if (!Res.HasUnknownCall) {
- for (Instruction &I : instructions(*F)) {
- if (auto *CB = dyn_cast<CallBase>(&I)) {
- if (!CB->getCalledFunction()) {
- Res.HasUnknownCall = true;
- break;
- }
- }
- }
- }
+
Res.SCCNodes.insert(F);
}
return Res;
@@ -2282,15 +2265,10 @@ deriveAttrsInPostOrder(ArrayRef<Function *> Functions, AARGetterT &&AARGetter,
addColdAttrs(Nodes.SCCNodes, Changed);
addWillReturn(Nodes.SCCNodes, Changed);
addNoUndefAttrs(Nodes.SCCNodes, Changed);
-
- // If we have no external nodes participating in the SCC, we can deduce some
- // more precise attributes as well.
- if (!Nodes.HasUnknownCall) {
- addNoAliasAttrs(Nodes.SCCNodes, Changed);
- addNonNullAttrs(Nodes.SCCNodes, Changed);
- inferAttrsFromFunctionBodies(Nodes.SCCNodes, Changed);
- addNoRecurseAttrs(Nodes.SCCNodes, Changed);
- }
+ addNoAliasAttrs(Nodes.SCCNodes, Changed);
+ addNonNullAttrs(Nodes.SCCNodes, Changed);
+ inferAttrsFromFunctionBodies(Nodes.SCCNodes, Changed);
+ addNoRecurseAttrs(Nodes.SCCNodes, Changed);
// Finally, infer the maximal set of attributes from the ones we've inferred
// above. This is handling the cases where one attribute on a signature
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index b803c97..2b392fe 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -97,6 +97,8 @@ STATISTIC(MissingAllocForContextId,
"Number of missing alloc nodes for context ids");
STATISTIC(SkippedCallsCloning,
"Number of calls skipped during cloning due to unexpected operand");
+STATISTIC(MismatchedCloneAssignments,
+ "Number of callsites assigned to call multiple non-matching clones");
static cl::opt<std::string> DotFilePathPrefix(
"memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
@@ -2060,6 +2062,20 @@ static bool isMemProfClone(const Function &F) {
return F.getName().contains(MemProfCloneSuffix);
}
+// Return the clone number of the given function by extracting it from the
+// memprof suffix. Assumes the caller has already confirmed it is a memprof
+// clone.
+static unsigned getMemProfCloneNum(const Function &F) {
+ assert(isMemProfClone(F));
+ auto Pos = F.getName().find_last_of('.');
+ assert(Pos > 0);
+ unsigned CloneNo;
+ bool Err = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo);
+ assert(!Err);
+ (void)Err;
+ return CloneNo;
+}
+
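// Editorial sketch (not part of this patch): the same parse using only the
// standard library; cloneNumFromName("foo.memprof.3") yields 3.
#include <string>
unsigned cloneNumFromName(const std::string &Name) {
  return std::stoul(Name.substr(Name.find_last_of('.') + 1));
}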
std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
const Instruction *Call,
unsigned CloneNo) const {
@@ -2073,14 +2089,14 @@ std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
unsigned CloneNo) const {
auto VI = FSToVIMap.find(Func);
assert(VI != FSToVIMap.end());
+ std::string CallerName = getMemProfFuncName(VI->second.name(), CloneNo);
if (isa<AllocInfo *>(Call))
- return (VI->second.name() + " -> alloc").str();
+ return CallerName + " -> alloc";
else {
auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call);
- return (VI->second.name() + " -> " +
- getMemProfFuncName(Callsite->Callee.name(),
- Callsite->Clones[CloneNo]))
- .str();
+ return CallerName + " -> " +
+ getMemProfFuncName(Callsite->Callee.name(),
+ Callsite->Clones[CloneNo]);
}
}
@@ -3979,7 +3995,22 @@ IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
FuncInfo CalleeFunc) {
- if (CalleeFunc.cloneNo() > 0)
+ auto *CurF = cast<CallBase>(CallerCall.call())->getCalledFunction();
+ auto NewCalleeCloneNo = CalleeFunc.cloneNo();
+ if (isMemProfClone(*CurF)) {
+ // If we already assigned this callsite to call a specific non-default
+ // clone (i.e. not the original function which is clone 0), ensure that we
+ // aren't trying to now update it to call a different clone, which is
+ // indicative of a bug in the graph or function assignment.
+ auto CurCalleeCloneNo = getMemProfCloneNum(*CurF);
+ if (CurCalleeCloneNo != NewCalleeCloneNo) {
+ LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
+ << CurCalleeCloneNo << " now " << NewCalleeCloneNo
+ << "\n");
+ MismatchedCloneAssignments++;
+ }
+ }
+ if (NewCalleeCloneNo > 0)
cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
OREGetter(CallerCall.call()->getFunction())
.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
@@ -3995,7 +4026,19 @@ void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
assert(CI &&
"Caller cannot be an allocation which should not have profiled calls");
assert(CI->Clones.size() > CallerCall.cloneNo());
- CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
+ auto NewCalleeCloneNo = CalleeFunc.cloneNo();
+ auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()];
+ // If we already assigned this callsite to call a specific non-default
+ // clone (i.e. not the original function which is clone 0), ensure that we
+ // aren't trying to now update it to call a different clone, which is
+ // indicative of a bug in the graph or function assignment.
+ if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) {
+ LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was "
+ << CurCalleeCloneNo << " now " << NewCalleeCloneNo
+ << "\n");
+ MismatchedCloneAssignments++;
+ }
+ CurCalleeCloneNo = NewCalleeCloneNo;
}
// Update the debug information attached to NewFunc to use the clone Name. Note
@@ -4457,14 +4500,24 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
};
+ // Information for a single clone of this Func.
+ struct FuncCloneInfo {
+ // The function clone.
+ FuncInfo FuncClone;
+ // Remappings of each call of interest (from original uncloned call to the
+ // corresponding cloned call in this function clone).
+ std::map<CallInfo, CallInfo> CallMap;
+ };
+
// Walk all functions for which we saw calls with memprof metadata, and handle
// cloning for each of its calls.
for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
FuncInfo OrigFunc(Func);
- // Map from each clone of OrigFunc to a map of remappings of each call of
- // interest (from original uncloned call to the corresponding cloned call in
- // that function clone).
- std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
+ // Map from each clone number of OrigFunc to information about that function
+ // clone (the function clone FuncInfo and call remappings). The index into
+ // the vector is the clone number, as function clones are created and
+ // numbered sequentially.
+ std::vector<FuncCloneInfo> FuncCloneInfos;
for (auto &Call : CallsWithMetadata) {
ContextNode *Node = getNodeForInst(Call);
// Skip call if we do not have a node for it (all uses of its stack ids
@@ -4488,8 +4541,9 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
// Record the clone of callsite node assigned to this function clone.
FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
- assert(FuncClonesToCallMap.count(FuncClone));
- std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
+ assert(FuncCloneInfos.size() > FuncClone.cloneNo());
+ std::map<CallInfo, CallInfo> &CallMap =
+ FuncCloneInfos[FuncClone.cloneNo()].CallMap;
CallInfo CallClone(Call);
if (auto It = CallMap.find(Call); It != CallMap.end())
CallClone = It->second;
@@ -4528,10 +4582,10 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
// than existing function clones, which would have been assigned to an
// earlier clone in the list (we assign callsite clones to function
// clones greedily).
- if (FuncClonesToCallMap.size() < NodeCloneCount) {
+ if (FuncCloneInfos.size() < NodeCloneCount) {
// If this is the first callsite copy, assign to original function.
if (NodeCloneCount == 1) {
- // Since FuncClonesToCallMap is empty in this case, no clones have
+ // Since FuncCloneInfos is empty in this case, no clones have
// been created for this function yet, and no callers should have
// been assigned a function clone for this callee node yet.
assert(llvm::none_of(
@@ -4540,7 +4594,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
}));
// Initialize with empty call map, assign Clone to original function
// and its callers, and skip to the next clone.
- FuncClonesToCallMap[OrigFunc] = {};
+ FuncCloneInfos.push_back({OrigFunc, {}});
AssignCallsiteCloneToFuncClone(
OrigFunc, Call, Clone,
AllocationCallToContextNodeMap.count(Call));
@@ -4572,14 +4626,14 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
}
// Clone function and save it along with the CallInfo map created
- // during cloning in the FuncClonesToCallMap.
+ // during cloning in the FuncCloneInfos.
std::map<CallInfo, CallInfo> NewCallMap;
- unsigned CloneNo = FuncClonesToCallMap.size();
+ unsigned CloneNo = FuncCloneInfos.size();
assert(CloneNo > 0 && "Clone 0 is the original function, which "
"should already exist in the map");
FuncInfo NewFuncClone = cloneFunctionForCallsite(
OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
- FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
+ FuncCloneInfos.push_back({NewFuncClone, std::move(NewCallMap)});
FunctionClonesAnalysis++;
Changed = true;
@@ -4681,7 +4735,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
CallInfo OrigCall(Callee->getOrigNode()->Call);
OrigCall.setCloneNo(0);
std::map<CallInfo, CallInfo> &CallMap =
- FuncClonesToCallMap[NewFuncClone];
+ FuncCloneInfos[NewFuncClone.cloneNo()].CallMap;
assert(CallMap.count(OrigCall));
CallInfo NewCall(CallMap[OrigCall]);
assert(NewCall);
@@ -4703,6 +4757,19 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
// where the callers were assigned to different clones of a function.
}
+ auto FindFirstAvailFuncClone = [&]() {
+ // Find first function in FuncCloneInfos without an assigned
+ // clone of this callsite Node. We should always have one
+ // available at this point due to the earlier cloning when the
+ // FuncCloneInfos size was smaller than the clone number.
+ for (auto &CF : FuncCloneInfos) {
+ if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone))
+ return CF.FuncClone;
+ }
+ llvm_unreachable(
+ "Expected an available func clone for this callsite clone");
+ };
+
// See if we can use existing function clone. Walk through
// all caller edges to see if any have already been assigned to
// a clone of this callsite's function. If we can use it, do so. If not,
@@ -4819,16 +4886,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
// clone of OrigFunc for another caller during this iteration over
// its caller edges.
if (!FuncCloneAssignedToCurCallsiteClone) {
- // Find first function in FuncClonesToCallMap without an assigned
- // clone of this callsite Node. We should always have one
- // available at this point due to the earlier cloning when the
- // FuncClonesToCallMap size was smaller than the clone number.
- for (auto &CF : FuncClonesToCallMap) {
- if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
- FuncCloneAssignedToCurCallsiteClone = CF.first;
- break;
- }
- }
+ FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
assert(FuncCloneAssignedToCurCallsiteClone);
// Assign Clone to FuncCloneAssignedToCurCallsiteClone
AssignCallsiteCloneToFuncClone(
@@ -4842,6 +4900,31 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
FuncCloneAssignedToCurCallsiteClone);
}
}
+ // If we didn't assign a function clone to this callsite clone yet, e.g.
+ // none of its callers has a non-null call, do the assignment here.
+ // We want to ensure that every callsite clone is assigned to some
+ // function clone, so that the call updates below work as expected.
+ // In particular if this is the original callsite, we want to ensure it
+ // is assigned to the original function, otherwise the original function
+ // will appear available for assignment to other callsite clones,
+ // leading to unintended effects. For one, the unknown and not updated
+ // callers will call into cloned paths leading to the wrong hints,
+ // because they still call the original function (clone 0). Also,
+ // because all callsites start out as being clone 0 by default, we can't
+ // easily distinguish between callsites explicitly assigned to clone 0
+ // vs those never assigned, which can lead to multiple updates of the
+ // calls when invoking updateCall below, with mismatched clone values.
+ // TODO: Add a flag to the callsite nodes or some other mechanism to
+ // better distinguish and identify callsite clones that are not getting
+ // assigned to function clones as expected.
+ if (!FuncCloneAssignedToCurCallsiteClone) {
+ FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone();
+ assert(FuncCloneAssignedToCurCallsiteClone &&
+ "No available func clone for this callsite clone");
+ AssignCallsiteCloneToFuncClone(
+ FuncCloneAssignedToCurCallsiteClone, Call, Clone,
+ /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call));
+ }
}
if (VerifyCCG) {
checkNode<DerivedCCG, FuncTy, CallTy>(Node);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 033ef8b..a43a6ee 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -708,10 +708,14 @@ static Instruction *shrinkSplatShuffle(TruncInst &Trunc,
auto *Shuf = dyn_cast<ShuffleVectorInst>(Trunc.getOperand(0));
if (Shuf && Shuf->hasOneUse() && match(Shuf->getOperand(1), m_Undef()) &&
all_equal(Shuf->getShuffleMask()) &&
- Shuf->getType() == Shuf->getOperand(0)->getType()) {
+ ElementCount::isKnownGE(Shuf->getType()->getElementCount(),
+ cast<VectorType>(Shuf->getOperand(0)->getType())
+ ->getElementCount())) {
// trunc (shuf X, Undef, SplatMask) --> shuf (trunc X), Poison, SplatMask
// trunc (shuf X, Poison, SplatMask) --> shuf (trunc X), Poison, SplatMask
- Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), Trunc.getType());
+ Type *NewTruncTy = Shuf->getOperand(0)->getType()->getWithNewType(
+ Trunc.getType()->getScalarType());
+ Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), NewTruncTy);
return new ShuffleVectorInst(NarrowOp, Shuf->getShuffleMask());
}
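// Editorial example (assumed IR shapes): the relaxed check also narrows
// length-changing splats now, e.g.
//   %s = shufflevector <2 x i32> %x, <2 x i32> poison, <4 x i32> zeroinitializer
//   %t = trunc <4 x i32> %s to <4 x i16>
// is rewritten to truncate the narrower source first:
//   %nx = trunc <2 x i32> %x to <2 x i16>
//   %t  = shufflevector <2 x i16> %nx, <2 x i16> poison, <4 x i32> zeroinitializer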
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index fe3315e..4e5a8d1 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -1459,8 +1459,6 @@ void HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
size_t Size = memtag::getAllocaSizeInBytes(*AI);
size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
- Value *AICast = IRB.CreatePointerCast(AI, PtrTy);
-
auto HandleLifetime = [&](IntrinsicInst *II) {
// Set the lifetime intrinsic to cover the whole alloca. This reduces the
// set of assumptions we need to make about the lifetime. Without this we
@@ -1473,14 +1471,13 @@ void HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
// one set of start / end in any execution (i.e. the ends are not
// reachable from each other), so this will not cause any problems.
II->setArgOperand(0, ConstantInt::get(Int64Ty, AlignedSize));
- II->setArgOperand(1, AICast);
};
llvm::for_each(Info.LifetimeStart, HandleLifetime);
llvm::for_each(Info.LifetimeEnd, HandleLifetime);
- AI->replaceUsesWithIf(Replacement, [AICast, AILong](const Use &U) {
+ AI->replaceUsesWithIf(Replacement, [AILong](const Use &U) {
auto *User = U.getUser();
- return User != AILong && User != AICast && !isa<LifetimeIntrinsic>(User);
+ return User != AILong && !isa<LifetimeIntrinsic>(User);
});
memtag::annotateDebugRecords(Info, retagMask(N));
diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
index f6780c0..ce1d9f1 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
@@ -456,7 +456,7 @@ static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
if (DisableMemOPOPT)
return false;
- if (F.hasFnAttribute(Attribute::OptimizeForSize))
+ if (F.hasOptSize())
return false;
MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI);
MemOPSizeOpt.perform();
diff --git a/llvm/lib/Transforms/ObjCARC/CMakeLists.txt b/llvm/lib/Transforms/ObjCARC/CMakeLists.txt
index 80867db..4274667 100644
--- a/llvm/lib/Transforms/ObjCARC/CMakeLists.txt
+++ b/llvm/lib/Transforms/ObjCARC/CMakeLists.txt
@@ -2,7 +2,6 @@ add_llvm_component_library(LLVMObjCARCOpts
ObjCARC.cpp
ObjCARCOpts.cpp
ObjCARCExpand.cpp
- ObjCARCAPElim.cpp
ObjCARCContract.cpp
DependencyAnalysis.cpp
ProvenanceAnalysis.cpp
diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
deleted file mode 100644
index dceb2eb..0000000
--- a/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
+++ /dev/null
@@ -1,156 +0,0 @@
-//===- ObjCARCAPElim.cpp - ObjC ARC Optimization --------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-///
-/// This file defines ObjC ARC optimizations. ARC stands for Automatic
-/// Reference Counting and is a system for managing reference counts for objects
-/// in Objective C.
-///
-/// This specific file implements optimizations which remove extraneous
-/// autorelease pools.
-///
-/// WARNING: This file knows about certain library functions. It recognizes them
-/// by name, and hardwires knowledge of their semantics.
-///
-/// WARNING: This file knows about how certain Objective-C library functions are
-/// used. Naive LLVM IR transformations which would otherwise be
-/// behavior-preserving may break these assumptions.
-///
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
-#include "llvm/Analysis/ObjCARCInstKind.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/ObjCARC.h"
-
-using namespace llvm;
-using namespace llvm::objcarc;
-
-#define DEBUG_TYPE "objc-arc-ap-elim"
-
-namespace {
-
-/// Interprocedurally determine if calls made by the given call site can
-/// possibly produce autoreleases.
-bool MayAutorelease(const CallBase &CB, unsigned Depth = 0) {
- if (const Function *Callee = CB.getCalledFunction()) {
- if (!Callee->hasExactDefinition())
- return true;
- for (const BasicBlock &BB : *Callee) {
- for (const Instruction &I : BB)
- if (const CallBase *JCB = dyn_cast<CallBase>(&I))
- // This recursion depth limit is arbitrary. It's just great
- // enough to cover known interesting testcases.
- if (Depth < 3 && !JCB->onlyReadsMemory() &&
- MayAutorelease(*JCB, Depth + 1))
- return true;
- }
- return false;
- }
-
- return true;
-}
-
-bool OptimizeBB(BasicBlock *BB) {
- bool Changed = false;
-
- Instruction *Push = nullptr;
- for (Instruction &Inst : llvm::make_early_inc_range(*BB)) {
- switch (GetBasicARCInstKind(&Inst)) {
- case ARCInstKind::AutoreleasepoolPush:
- Push = &Inst;
- break;
- case ARCInstKind::AutoreleasepoolPop:
- // If this pop matches a push and nothing in between can autorelease,
- // zap the pair.
- if (Push && cast<CallInst>(&Inst)->getArgOperand(0) == Push) {
- Changed = true;
- LLVM_DEBUG(dbgs() << "ObjCARCAPElim::OptimizeBB: Zapping push pop "
- "autorelease pair:\n"
- " Pop: "
- << Inst << "\n"
- << " Push: " << *Push
- << "\n");
- Inst.eraseFromParent();
- Push->eraseFromParent();
- }
- Push = nullptr;
- break;
- case ARCInstKind::CallOrUser:
- if (MayAutorelease(cast<CallBase>(Inst)))
- Push = nullptr;
- break;
- default:
- break;
- }
- }
-
- return Changed;
-}
-
-bool runImpl(Module &M) {
- if (!EnableARCOpts)
- return false;
-
- // If nothing in the Module uses ARC, don't do anything.
- if (!ModuleHasARC(M))
- return false;
- // Find the llvm.global_ctors variable, as the first step in
- // identifying the global constructors. In theory, unnecessary autorelease
- // pools could occur anywhere, but in practice it's pretty rare. Global
- // ctors are a place where autorelease pools get inserted automatically,
- // so it's pretty common for them to be unnecessary, and it's pretty
- // profitable to eliminate them.
- GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
- if (!GV)
- return false;
-
- assert(GV->hasDefinitiveInitializer() &&
- "llvm.global_ctors is uncooperative!");
-
- bool Changed = false;
-
- // Dig the constructor functions out of GV's initializer.
- ConstantArray *Init = cast<ConstantArray>(GV->getInitializer());
- for (User::op_iterator OI = Init->op_begin(), OE = Init->op_end();
- OI != OE; ++OI) {
- Value *Op = *OI;
- // llvm.global_ctors is an array of three-field structs where the second
- // members are constructor functions.
- Function *F = dyn_cast<Function>(cast<ConstantStruct>(Op)->getOperand(1));
- // If the user used a constructor function with the wrong signature and
- // it got bitcasted or whatever, look the other way.
- if (!F)
- continue;
- // Only look at function definitions.
- if (F->isDeclaration())
- continue;
- // Only look at functions with one basic block.
- if (std::next(F->begin()) != F->end())
- continue;
- // Ok, a single-block constructor function definition. Try to optimize it.
- Changed |= OptimizeBB(&F->front());
- }
-
- return Changed;
-}
-
-} // namespace
-
-PreservedAnalyses ObjCARCAPElimPass::run(Module &M, ModuleAnalysisManager &AM) {
- if (!runImpl(M))
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index d6bd92d..b5eb647 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -1176,6 +1176,28 @@ private:
return true;
}
+ /// This function fixes the PHI nodes in \p SafeToSink after fusion.
+ /// The \p SafeToSink instructions are those that are to be moved past the
+ /// fused loop, so any PHI nodes among them that currently take values from
+ /// the latch of the first loop (FC0) must be updated to receive values
+ /// from the fused loop instead.
+ void fixPHINodes(ArrayRef<Instruction *> SafeToSink,
+ const FusionCandidate &FC0,
+ const FusionCandidate &FC1) const {
+ for (Instruction *Inst : SafeToSink) {
+ // No update needed for non-PHI nodes.
+ PHINode *Phi = dyn_cast<PHINode>(Inst);
+ if (!Phi)
+ continue;
+ for (unsigned I = 0; I < Phi->getNumIncomingValues(); I++) {
+ if (Phi->getIncomingBlock(I) != FC0.Latch)
+ continue;
+ assert(FC1.Latch && "FC1 latch is not set");
+ Phi->setIncomingBlock(I, FC1.Latch);
+ }
+ }
+ }
+
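// Editorial example (assumed IR): a PHI among the sunk instructions such as
//   %p = phi i32 [ %v, %fc0.latch ], [ %w, %other.pred ]
// is rewired by fixPHINodes() to take the value from the fused loop:
//   %p = phi i32 [ %v, %fc1.latch ], [ %w, %other.pred ]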
/// Collect instructions in the \p FC1 Preheader that can be hoisted
/// to the \p FC0 Preheader or sunk into the \p FC1 Body
bool collectMovablePreheaderInsts(
@@ -1481,6 +1503,9 @@ private:
assert(I->getParent() == FC1.Preheader);
I->moveBefore(*FC1.ExitBlock, FC1.ExitBlock->getFirstInsertionPt());
}
+ // PHI nodes in SinkInsts need to be updated to receive values from the
+ // fused loop.
+ fixPHINodes(SinkInsts, FC0, FC1);
}
/// Determine if two fusion candidates have identical guards
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 70e9eee..08446cc 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -17,8 +17,8 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSet.h"
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/LoopCacheAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -70,6 +70,13 @@ namespace {
using LoopVector = SmallVector<Loop *, 8>;
+/// A list of direction vectors. Each entry represents a direction vector
+/// corresponding to one or more dependencies existing in the loop nest. The
+/// length of all direction vectors is equal and is N + 1, where N is the depth
+/// of the loop nest. The first N elements correspond to the dependency
+/// direction of each of the N loops. The last one indicates whether this
+/// entry is a forward dependency ('<') or not ('*'). The term "forward"
+/// aligns with what is defined in LoopAccessAnalysis.
// TODO: Check if we can use a sparse matrix here.
using CharMatrix = std::vector<std::vector<char>>;
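// Editorial sketch (not part of this patch): a standalone model of how a
// legality check consumes such a matrix; the trailing '<'/'*' flag is skipped.
#include <vector>

// Illustrative stand-in for isLexicographicallyPositive() used below, which
// scans the first End direction entries and rejects vectors that are
// lexicographically negative.
static bool lexNonNegative(const std::vector<char> &Dir, unsigned End) {
  for (unsigned I = 0; I != End; ++I) {
    if (Dir[I] == '<')
      return true;  // outermost non-'=' level is positive
    if (Dir[I] == '>')
      return false; // outermost non-'=' level is negative
  }
  return true; // all '=' or 'I'
}

// Example: a depth-2 nest with the single entry {'<', '>', '*'} passes as-is
// (End == 2 skips the flag), but swapping the two loops yields
// {'>', '<', '*'}, which fails, so the interchange is rejected.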
@@ -126,11 +133,33 @@ static bool noDuplicateRulesAndIgnore(ArrayRef<RuleTy> Rules) {
static void printDepMatrix(CharMatrix &DepMatrix) {
for (auto &Row : DepMatrix) {
- for (auto D : Row)
+ // Drop the last element because it is a flag indicating whether this is
+ // forward dependency or not, which doesn't affect the legality check.
+ for (char D : drop_end(Row))
LLVM_DEBUG(dbgs() << D << " ");
LLVM_DEBUG(dbgs() << "\n");
}
}
+
+/// Return true if \p Src appears before \p Dst in the same basic block.
+/// Precondition: \p Src and \p Dst are distinct instructions within the same
+/// basic block.
+static bool inThisOrder(const Instruction *Src, const Instruction *Dst) {
+ assert(Src->getParent() == Dst->getParent() && Src != Dst &&
+ "Expected Src and Dst to be different instructions in the same BB");
+
+ bool FoundSrc = false;
+ for (const Instruction &I : *(Src->getParent())) {
+ if (&I == Src) {
+ FoundSrc = true;
+ continue;
+ }
+ if (&I == Dst)
+ return FoundSrc;
+ }
+
+ llvm_unreachable("Dst not found");
+}
#endif
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
@@ -174,7 +203,10 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
return false;
}
ValueVector::iterator I, IE, J, JE;
- StringSet<> Seen;
+
+ // Manage direction vectors that are already seen. Map each direction vector
+ // to an index of DepMatrix at which it is stored.
+ StringMap<unsigned> Seen;
for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
for (J = I, JE = MemInstr.end(); J != JE; ++J) {
@@ -228,9 +260,49 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
Dep.push_back('I');
}
+ // Test whether the dependency is forward or not.
+ bool IsKnownForward = true;
+ if (Src->getParent() != Dst->getParent()) {
+ // In general, when Src and Dst are in different BBs, their execution
+ // order within a single iteration is not guaranteed. Conservatively
+ // treat the dependency as not-forward in this case.
+ IsKnownForward = false;
+ } else {
+ // Src and Dst are in the same BB. If they are different
+ // instructions, Src should appear before Dst in the BB as they are
+ // stored to MemInstr in that order.
+ assert((Src == Dst || inThisOrder(Src, Dst)) &&
+ "Unexpected instructions");
+
+ // If the Dependence object is reversed (due to normalization), it
+ // represents the dependency from Dst to Src, meaning it is a backward
+ // dependency. Otherwise it should be a forward dependency.
+ bool IsReversed = D->getSrc() != Src;
+ if (IsReversed)
+ IsKnownForward = false;
+ }
+
+ // Initialize the last element. Assume forward dependencies only; it
+ // will be updated later if there is any non-forward dependency.
+ Dep.push_back('<');
+
+ // The last element should express the "summary" among one or more
+ // direction vectors whose first N elements are the same (where N is
+ // the depth of the loop nest). Hence we exclude the last element from
+ // the Seen map.
+ auto [Ite, Inserted] = Seen.try_emplace(
+ StringRef(Dep.data(), Dep.size() - 1), DepMatrix.size());
+
// Make sure we only add unique entries to the dependency matrix.
- if (Seen.insert(StringRef(Dep.data(), Dep.size())).second)
+ if (Inserted)
DepMatrix.push_back(Dep);
+
+ // If we cannot prove that this dependency is forward, change the last
+ // element of the corresponding entry. Since a `[... *]` dependency
+ // includes a `[... <]` dependency, we do not need to keep both;
+ // change the existing entry instead.
+ if (!IsKnownForward)
+ DepMatrix[Ite->second].back() = '*';
}
}
}
@@ -281,11 +353,12 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
continue;
// Check if the direction vector is lexicographically positive (or zero)
- // for both before/after exchanged.
- if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false)
+ // for both before/after exchanged. Ignore the last element because it
+ // doesn't affect the legality.
+ if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
return false;
std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
- if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false)
+ if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
return false;
}
return true;
@@ -1334,22 +1407,35 @@ LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
for (const auto &Dep : DepMatrix) {
char Dir = Dep[LoopId];
- if (Dir != 'I' && Dir != '=')
- return false;
+ char DepType = Dep.back();
+ assert((DepType == '<' || DepType == '*') &&
+ "Unexpected element in dependency vector");
+
+ // There are no loop-carried dependencies.
+ if (Dir == '=' || Dir == 'I')
+ continue;
+
+ // DepType being '<' means that this direction vector represents a forward
+ // dependency. In principle, a loop with '<' direction can be vectorized in
+ // this case.
+ if (Dir == '<' && DepType == '<')
+ continue;
+
+ // We cannot prove that the loop is vectorizable.
+ return false;
}
return true;
}
std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
- // If the outer loop is not loop independent it is not profitable to move
- // this to inner position, since doing so would not enable inner loop
- // parallelism.
+ // If the outer loop cannot be vectorized, it is not profitable to move this
+ // to inner position.
if (!canVectorize(DepMatrix, OuterLoopId))
return false;
- // If inner loop has dependence and outer loop is loop independent then it is
- // profitable to interchange to enable inner loop parallelism.
+ // If the inner loop cannot be vectorized but the outer loop can be, then it
+ // is profitable to interchange to enable inner loop parallelism.
if (!canVectorize(DepMatrix, InnerLoopId))
return true;
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 84d1c0b..9220abb 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1593,11 +1593,8 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
// since both llvm.lifetime.start and llvm.lifetime.end intrinsics
// practically fill all the bytes of the alloca with an undefined
// value, although conceptually marked as alive/dead.
- int64_t Size = cast<ConstantInt>(UI->getOperand(0))->getSExtValue();
- if (Size < 0 || Size == DestSize) {
- LifetimeMarkers.push_back(UI);
- continue;
- }
+ LifetimeMarkers.push_back(UI);
+ continue;
}
AAMetadataInstrs.insert(UI);
@@ -1614,9 +1611,8 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
return true;
};
- // Check that dest has no Mod/Ref, from the alloca to the Store, except full
- // size lifetime intrinsics. And collect modref inst for the reachability
- // check.
+ // Check that dest has no Mod/Ref, from the alloca to the Store. And collect
+ // modref inst for the reachability check.
ModRefInfo DestModRef = ModRefInfo::NoModRef;
MemoryLocation DestLoc(DestAlloca, LocationSize::precise(Size));
SmallVector<BasicBlock *, 8> ReachabilityWorklist;
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index a69d649..44e63a0 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -19,6 +19,7 @@
#include "llvm/Analysis/RegionInfo.h"
#include "llvm/Analysis/RegionIterator.h"
#include "llvm/Analysis/RegionPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -128,6 +129,7 @@ struct PredInfo {
using BBPredicates = DenseMap<BasicBlock *, PredInfo>;
using PredMap = DenseMap<BasicBlock *, BBPredicates>;
using BB2BBMap = DenseMap<BasicBlock *, BasicBlock *>;
+using Val2BBMap = DenseMap<Value *, BasicBlock *>;
// A traits type that is intended to be used in graph algorithms. The graph
// traits starts at an entry node, and traverses the RegionNodes that are in
@@ -279,7 +281,7 @@ class StructurizeCFG {
ConstantInt *BoolTrue;
ConstantInt *BoolFalse;
Value *BoolPoison;
-
+ const TargetTransformInfo *TTI;
Function *Func;
Region *ParentRegion;
@@ -301,8 +303,12 @@ class StructurizeCFG {
PredMap LoopPreds;
BranchVector LoopConds;
+ Val2BBMap HoistedValues;
+
RegionNode *PrevNode;
+ void hoistZeroCostElseBlockPhiValues(BasicBlock *ElseBB, BasicBlock *ThenBB);
+
void orderNodes();
void analyzeLoops(RegionNode *N);
@@ -332,6 +338,8 @@ class StructurizeCFG {
void simplifyAffectedPhis();
+ void simplifyHoistedPhis();
+
DebugLoc killTerminator(BasicBlock *BB);
void changeExit(RegionNode *Node, BasicBlock *NewExit,
@@ -359,7 +367,7 @@ class StructurizeCFG {
public:
void init(Region *R);
- bool run(Region *R, DominatorTree *DT);
+ bool run(Region *R, DominatorTree *DT, const TargetTransformInfo *TTI);
bool makeUniformRegion(Region *R, UniformityInfo &UA);
};
@@ -385,8 +393,11 @@ public:
if (SCFG.makeUniformRegion(R, UA))
return false;
}
+ Function *F = R->getEntry()->getParent();
+ const TargetTransformInfo *TTI =
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*F);
DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- return SCFG.run(R, DT);
+ return SCFG.run(R, DT, TTI);
}
StringRef getPassName() const override { return "Structurize control flow"; }
@@ -394,7 +405,9 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
if (SkipUniformRegions)
AU.addRequired<UniformityInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
RegionPass::getAnalysisUsage(AU);
@@ -403,6 +416,34 @@ public:
} // end anonymous namespace
+/// Checks whether an instruction is a zero-cost instruction whose operands
+/// are all defined outside \p BB. Such an instruction can be coalesced if it
+/// is hoisted to a predecessor block, so this returns true.
+static bool isHoistableInstruction(Instruction *I, BasicBlock *BB,
+ const TargetTransformInfo *TTI) {
+ if (I->getParent() != BB || isa<PHINode>(I))
+ return false;
+
+ // If the instruction is not a zero cost instruction, return false.
+ auto Cost = TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency);
+ InstructionCost::CostType CostVal =
+ Cost.isValid()
+ ? Cost.getValue()
+ : (InstructionCost::CostType)TargetTransformInfo::TCC_Expensive;
+ if (CostVal != 0)
+ return false;
+
+ // Check if any operands are instructions defined in the same block.
+ for (auto &Op : I->operands()) {
+ if (auto *OpI = dyn_cast<Instruction>(Op)) {
+ if (OpI->getParent() == BB)
+ return false;
+ }
+ }
+
+ return true;
+}
+
char StructurizeCFGLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(StructurizeCFGLegacyPass, "structurizecfg",
@@ -413,6 +454,39 @@ INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
INITIALIZE_PASS_END(StructurizeCFGLegacyPass, "structurizecfg",
"Structurize the CFG", false, false)
+/// Structurization can introduce unnecessary VGPR copies due to register
+/// coalescing interference. For example, if the Else block has a zero-cost
+/// instruction and the Then block modifies the VGPR value, only one value is
+/// live at a time in the merge block before structurization. After
+/// structurization, the coalescer may incorrectly treat the Then value as
+/// live in the Else block (via the path Then → Flow → Else), leading to
+/// unnecessary VGPR copies.
+///
+/// This function examines phi nodes whose incoming values are zero-cost
+/// instructions in the Else block. It identifies such values that can be safely
+/// hoisted and moves them to the nearest common dominator of Then and Else
+/// blocks. A follow-up function, run after the PHI values are set, assigns
+/// the hoisted value to the poison phi entries along the if→flow edge,
+/// aiding register coalescing and minimizing unnecessary live ranges.
+void StructurizeCFG::hoistZeroCostElseBlockPhiValues(BasicBlock *ElseBB,
+ BasicBlock *ThenBB) {
+
+ BasicBlock *ElseSucc = ElseBB->getSingleSuccessor();
+ BasicBlock *CommonDominator = DT->findNearestCommonDominator(ElseBB, ThenBB);
+
+ if (!ElseSucc || !CommonDominator)
+ return;
+ Instruction *Term = CommonDominator->getTerminator();
+ for (PHINode &Phi : ElseSucc->phis()) {
+ Value *ElseVal = Phi.getIncomingValueForBlock(ElseBB);
+ auto *Inst = dyn_cast<Instruction>(ElseVal);
+ if (!Inst || !isHoistableInstruction(Inst, ElseBB, TTI))
+ continue;
+ Inst->removeFromParent();
+ Inst->insertInto(CommonDominator, Term->getIterator());
+ HoistedValues[Inst] = CommonDominator;
+ }
+}
+
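// Editorial sketch (names assumed) of the shape this targets. Before
// structurization:
//   %dom:  ...                                ; nearest common dominator
//   %then: %a = ...                           ; clobbers the register
//   %else: %b = <zero-cost op of %dom values>
//   %m:    %p = phi [ %a, %then ], [ %b, %else ]
// hoistZeroCostElseBlockPhiValues() moves %b into %dom; once the Flow block
// is introduced, simplifyHoistedPhis() replaces the poison entry on the
// then->flow edge with %b, so the coalescer never sees the Then value live
// through the Else block.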
/// Build up the general order of nodes, by performing a topological sort of the
/// parent region's nodes, while ensuring that there is no outer cycle node
/// between any two inner cycle nodes.
@@ -535,7 +609,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
BasicBlock *Other = Term->getSuccessor(!i);
if (Visited.count(Other) && !Loops.count(Other) &&
!Pred.count(Other) && !Pred.count(P)) {
-
+ hoistZeroCostElseBlockPhiValues(Succ, Other);
Pred[Other] = {BoolFalse, std::nullopt};
Pred[P] = {BoolTrue, std::nullopt};
continue;
@@ -891,6 +965,44 @@ void StructurizeCFG::setPhiValues() {
AffectedPhis.append(InsertedPhis.begin(), InsertedPhis.end());
}
+/// Updates PHI nodes after hoisting zero-cost instructions by replacing
+/// poison entries on Flow nodes with the appropriate hoisted values.
+void StructurizeCFG::simplifyHoistedPhis() {
+ for (WeakVH VH : AffectedPhis) {
+ PHINode *Phi = dyn_cast_or_null<PHINode>(VH);
+ if (!Phi || Phi->getNumIncomingValues() != 2)
+ continue;
+
+ for (int i = 0; i < 2; i++) {
+ Value *V = Phi->getIncomingValue(i);
+ auto BBIt = HoistedValues.find(V);
+
+ if (BBIt == HoistedValues.end())
+ continue;
+
+ Value *OtherV = Phi->getIncomingValue(!i);
+ PHINode *OtherPhi = dyn_cast<PHINode>(OtherV);
+ if (!OtherPhi)
+ continue;
+
+ int PoisonValBBIdx = -1;
+ for (size_t J = 0; J < OtherPhi->getNumIncomingValues(); J++) {
+ if (!isa<PoisonValue>(OtherPhi->getIncomingValue(J)))
+ continue;
+ PoisonValBBIdx = J;
+ break;
+ }
+ if (PoisonValBBIdx == -1 ||
+ !DT->dominates(BBIt->second,
+ OtherPhi->getIncomingBlock(PoisonValBBIdx)))
+ continue;
+
+ OtherPhi->setIncomingValue(PoisonValBBIdx, V);
+ Phi->setIncomingValue(i, OtherV);
+ }
+ }
+}
+
void StructurizeCFG::simplifyAffectedPhis() {
bool Changed;
do {
@@ -1283,12 +1395,13 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) {
}
/// Run the transformation for each region found
-bool StructurizeCFG::run(Region *R, DominatorTree *DT) {
+bool StructurizeCFG::run(Region *R, DominatorTree *DT,
+ const TargetTransformInfo *TTI) {
if (R->isTopLevelRegion())
return false;
this->DT = DT;
-
+ this->TTI = TTI;
Func = R->getEntry()->getParent();
assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator.");
@@ -1300,6 +1413,7 @@ bool StructurizeCFG::run(Region *R, DominatorTree *DT) {
insertConditions(false);
insertConditions(true);
setPhiValues();
+ simplifyHoistedPhis();
simplifyConditions();
simplifyAffectedPhis();
rebuildSSA();
@@ -1349,7 +1463,7 @@ PreservedAnalyses StructurizeCFGPass::run(Function &F,
bool Changed = false;
DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
auto &RI = AM.getResult<RegionInfoAnalysis>(F);
-
+ TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F);
UniformityInfo *UI = nullptr;
if (SkipUniformRegions)
UI = &AM.getResult<UniformityInfoAnalysis>(F);
@@ -1368,7 +1482,7 @@ PreservedAnalyses StructurizeCFGPass::run(Function &F,
continue;
}
- Changed |= SCFG.run(R, DT);
+ Changed |= SCFG.run(R, DT, TTI);
}
if (!Changed)
return PreservedAnalyses::all();
diff --git a/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
index 9fe655e..fca09c6 100644
--- a/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
+++ b/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
@@ -498,7 +498,7 @@ bool LibCallsShrinkWrap::perform(CallInst *CI) {
static bool runImpl(Function &F, const TargetLibraryInfo &TLI,
DominatorTree *DT) {
- if (F.hasFnAttribute(Attribute::OptimizeForSize))
+ if (F.hasOptSize())
return false;
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
LibCallsShrinkWrap CCDCE(TLI, DTU);
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index ed08c0b..571fa11 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
@@ -42,6 +43,7 @@ cl::opt<unsigned> llvm::SCEVCheapExpansionBudget(
"controls the budget that is considered cheap (default = 4)"));
using namespace PatternMatch;
+using namespace SCEVPatternMatch;
PoisonFlags::PoisonFlags(const Instruction *I) {
NUW = false;
@@ -1224,6 +1226,7 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
}
Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) {
+ Type *STy = S->getType();
const Loop *L = S->getLoop();
BasicBlock *EB = L->getExitBlock();
if (!EB || !EB->getSinglePredecessor() ||
@@ -1231,11 +1234,36 @@ Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) {
return nullptr;
for (auto &PN : EB->phis()) {
- if (!SE.isSCEVable(PN.getType()) || PN.getType() != S->getType())
+ if (!SE.isSCEVable(PN.getType()))
continue;
- auto *ExitV = SE.getSCEV(&PN);
- if (S == ExitV)
- return &PN;
+ auto *ExitSCEV = SE.getSCEV(&PN);
+ if (!isa<SCEVAddRecExpr>(ExitSCEV))
+ continue;
+ Type *PhiTy = PN.getType();
+ if (STy->isIntegerTy() && PhiTy->isPointerTy())
+ ExitSCEV = SE.getPtrToIntExpr(ExitSCEV, STy);
+ else if (S->getType() != PN.getType())
+ continue;
+
+ // Check if we can re-use the existing PN, by adjusting it with an expanded
+ // offset, if the offset is simpler.
+ const SCEV *Diff = SE.getMinusSCEV(S, ExitSCEV);
+ const SCEV *Op = Diff;
+ match(Diff, m_scev_Mul(m_scev_AllOnes(), m_SCEV(Op)));
+ match(Op, m_scev_PtrToInt(m_SCEV(Op)));
+ if (!isa<SCEVConstant, SCEVUnknown>(Op))
+ continue;
+
+ assert(Diff->getType()->isIntegerTy() &&
+ "difference must be of integer type");
+ Value *DiffV = expand(Diff);
+ Value *BaseV = fixupLCSSAFormFor(&PN);
+ if (PhiTy->isPointerTy()) {
+ if (STy->isPointerTy())
+ return Builder.CreatePtrAdd(BaseV, DiffV);
+ BaseV = Builder.CreatePtrToInt(BaseV, DiffV->getType());
+ }
+ return Builder.CreateAdd(BaseV, DiffV);
}
return nullptr;
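// Editorial worked example (assumed shapes): when expanding
//   S        = {%start + 4,+,4}<%loop>
// and the exit block already holds an LCSSA phi %lcssa whose SCEV is
//   ExitSCEV = {%start,+,4}<%loop>,
// the difference S - ExitSCEV folds to the constant 4, so the code above
// reuses the phi and emits "add %lcssa, 4" instead of expanding S from
// scratch.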
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 99a96a8..7b7efb8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1363,11 +1363,15 @@ public:
TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
if (EVLIsLegal)
return;
- // If for some reason EVL mode is unsupported, fallback to
- // DataWithoutLaneMask to try to vectorize the loop with folded tail
- // in a generic way.
- ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
- TailFoldingStyle::DataWithoutLaneMask};
+ // If for some reason EVL mode is unsupported, fall back to a scalar
+ // epilogue if it is allowed, or to DataWithoutLaneMask otherwise.
+ if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
+ ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate)
+ ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
+ else
+ ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
+ TailFoldingStyle::DataWithoutLaneMask};
+
LLVM_DEBUG(
dbgs() << "LV: Preference for VP intrinsics indicated. Will "
"not try to generate VP Intrinsics "
@@ -2021,6 +2025,9 @@ public:
/// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
/// outside VPlan.
std::pair<Value *, BasicBlock *> getMemRuntimeChecks() {
+ using namespace llvm::PatternMatch;
+ if (MemRuntimeCheckCond && match(MemRuntimeCheckCond, m_ZeroInt()))
+ return {nullptr, nullptr};
return {MemRuntimeCheckCond, MemCheckBlock};
}
@@ -4497,19 +4504,17 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
Type *TCType = Legal->getWidestInductionType();
const SCEV *RemainingIterations = nullptr;
unsigned MaxTripCount = 0;
- if (MainLoopVF.isFixed()) {
- // TODO: extend to support scalable VFs.
- const SCEV *TC = vputils::getSCEVExprForVPValue(
- getPlanFor(MainLoopVF).getTripCount(), SE);
- assert(!isa<SCEVCouldNotCompute>(TC) &&
- "Trip count SCEV must be computable");
- RemainingIterations = SE.getURemExpr(
- TC, SE.getConstant(TCType, MainLoopVF.getFixedValue() * IC));
-
- // No iterations left to process in the epilogue.
- if (RemainingIterations->isZero())
- return Result;
+ const SCEV *TC =
+ vputils::getSCEVExprForVPValue(getPlanFor(MainLoopVF).getTripCount(), SE);
+ assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable");
+ RemainingIterations =
+ SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC));
+
+ // No iterations left to process in the epilogue.
+ if (RemainingIterations->isZero())
+ return Result;
+ if (MainLoopVF.isFixed()) {
MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
SE.getConstant(TCType, MaxTripCount))) {
@@ -7276,6 +7281,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
+ VPlanTransforms::removeBranchOnConst(BestVPlan);
VPlanTransforms::narrowInterleaveGroups(
BestVPlan, BestVF,
TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
@@ -10072,12 +10078,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Get user vectorization factor and interleave count.
ElementCount UserVF = Hints.getWidth();
unsigned UserIC = Hints.getInterleave();
- if (LVL.hasUncountableEarlyExit() && UserIC != 1) {
- UserIC = 1;
- reportVectorizationInfo("Interleaving not supported for loops "
- "with uncountable early exits",
- "InterleaveEarlyExitDisabled", ORE, L);
- }
// Plan how to best vectorize.
LVP.plan(UserVF, UserIC);
@@ -10095,9 +10095,20 @@ bool LoopVectorizePass::processLoop(Loop *L) {
unsigned SelectedIC = std::max(IC, UserIC);
// Optimistically generate runtime checks if they are needed. Drop them if
// they turn out to not be profitable.
- if (VF.Width.isVector() || SelectedIC > 1)
+ if (VF.Width.isVector() || SelectedIC > 1) {
Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
+ // Bail out early if either the SCEV or memory runtime checks are known to
+ // fail. In that case, the vector loop would never execute.
+ using namespace llvm::PatternMatch;
+ if (Checks.getSCEVChecks().first &&
+ match(Checks.getSCEVChecks().first, m_One()))
+ return false;
+ if (Checks.getMemRuntimeChecks().first &&
+ match(Checks.getMemRuntimeChecks().first, m_One()))
+ return false;
+ }
+
// Check if it is profitable to vectorize with runtime checks.
bool ForceVectorization =
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
@@ -10228,6 +10239,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
ElementCount::getFixed(1), IC, &CM, BFI, PSI, Checks, BestPlan);
+ // TODO: Move to general VPlan pipeline once epilogue loops are also
+ // supported.
+ VPlanTransforms::runPass(VPlanTransforms::materializeVectorTripCount,
+ BestPlan, VF.Width, IC, PSE);
+
LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
ORE->emit([&]() {
@@ -10295,6 +10311,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
VF.MinProfitableTripCount, IC, &CM, BFI, PSI,
Checks, BestPlan);
+ // TODO: Move to general VPlan pipeline once epilogue loops are also
+ // supported.
+ VPlanTransforms::runPass(VPlanTransforms::materializeVectorTripCount,
+ BestPlan, VF.Width, IC, PSE);
+
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
++LoopsVectorized;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0d0b342..593868f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -206,6 +206,12 @@ static cl::opt<bool> VectorizeNonPowerOf2(
"slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
cl::desc("Try to vectorize with non-power-of-2 number of elements."));
+/// Enables vectorization of copyable elements.
+static cl::opt<bool> VectorizeCopyableElements(
+ "slp-copyable-elements", cl::init(true), cl::Hidden,
+ cl::desc("Try to replace values with the idempotent instructions for "
+ "better vectorization."));
+
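// Editorial sketch of the idea (assumed semantics): a bundle such as
//   %a = add i32 %x, %y
//   %c            ; plain value, no matching opcode
// can still form a vectorizable pair by modeling %c as the idempotent
//   %c' = add i32 %c, 0
// so both lanes share the Add opcode; %c is then a "copyable" element.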
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;
@@ -855,6 +861,13 @@ static std::optional<unsigned> getExtractIndex(const Instruction *E) {
return *EI->idx_begin();
}
+namespace llvm {
+/// Checks if the specified value does not require scheduling. It does not
+/// require scheduling if all operands and all users do not need to be scheduled
+/// in the current basic block.
+static bool doesNotNeedToBeScheduled(Value *V);
+} // namespace llvm
+
namespace {
/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
@@ -957,6 +970,33 @@ class BinOpSameOpcodeHelper {
return Instruction::Xor;
llvm_unreachable("Cannot find interchangeable instruction.");
}
+
+ /// Return true if the instruction can be converted to \p Opcode.
+ bool hasCandidateOpcode(unsigned Opcode) const {
+ MaskType Candidate = Mask & SeenBefore;
+ switch (Opcode) {
+ case Instruction::Shl:
+ return Candidate & ShlBIT;
+ case Instruction::AShr:
+ return Candidate & AShrBIT;
+ case Instruction::Mul:
+ return Candidate & MulBIT;
+ case Instruction::Add:
+ return Candidate & AddBIT;
+ case Instruction::Sub:
+ return Candidate & SubBIT;
+ case Instruction::And:
+ return Candidate & AndBIT;
+ case Instruction::Or:
+ return Candidate & OrBIT;
+ case Instruction::Xor:
+ return Candidate & XorBIT;
+ default:
+ break;
+ }
+ llvm_unreachable("Cannot find interchangeable instruction.");
+ }
+
SmallVector<Value *> getOperand(const Instruction *To) const {
unsigned ToOpcode = To->getOpcode();
unsigned FromOpcode = I->getOpcode();
@@ -1117,6 +1157,10 @@ public:
AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
}
unsigned getMainOpcode() const { return MainOp.getOpcode(); }
+ /// Checks if the list of potential opcodes includes \p Opcode.
+ bool hasCandidateOpcode(unsigned Opcode) const {
+ return MainOp.hasCandidateOpcode(Opcode);
+ }
bool hasAltOp() const { return AltOp.I; }
unsigned getAltOpcode() const {
return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
@@ -1152,6 +1196,8 @@ class InstructionsState {
/// GetVectorCost.
Instruction *MainOp = nullptr;
Instruction *AltOp = nullptr;
+  /// Whether the instruction state represents copyable instructions.
+ bool HasCopyables = false;
public:
Instruction *getMainOp() const {
@@ -1190,9 +1236,11 @@ public:
if (!I->isBinaryOp())
return nullptr;
BinOpSameOpcodeHelper Converter(MainOp);
- if (Converter.add(I) && Converter.add(MainOp) && !Converter.hasAltOp())
- return MainOp;
- return AltOp;
+ if (!Converter.add(I) || !Converter.add(MainOp))
+ return nullptr;
+ if (Converter.hasAltOp() && !isAltShuffle())
+ return nullptr;
+ return Converter.hasAltOp() ? AltOp : MainOp;
}
/// Checks if main/alt instructions are shift operations.
@@ -1237,9 +1285,63 @@ public:
explicit operator bool() const { return valid(); }
InstructionsState() = delete;
- InstructionsState(Instruction *MainOp, Instruction *AltOp)
- : MainOp(MainOp), AltOp(AltOp) {}
+ InstructionsState(Instruction *MainOp, Instruction *AltOp,
+ bool HasCopyables = false)
+ : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
static InstructionsState invalid() { return {nullptr, nullptr}; }
+
+ bool isCopyableElement(Value *V) const {
+ assert(valid() && "InstructionsState is invalid.");
+ if (!HasCopyables)
+ return false;
+ if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
+ return false;
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return !isa<PoisonValue>(V);
+ if (I->getParent() != MainOp->getParent() &&
+ (!isVectorLikeInstWithConstOps(I) ||
+ !isVectorLikeInstWithConstOps(MainOp)))
+ return true;
+ if (I->getOpcode() == MainOp->getOpcode())
+ return false;
+ if (!I->isBinaryOp())
+ return true;
+ BinOpSameOpcodeHelper Converter(MainOp);
+ return !Converter.add(I) || !Converter.add(MainOp) ||
+ Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode());
+ }
+
+ /// Checks if the value is non-schedulable.
+ bool isNonSchedulable(Value *V) const {
+ assert(valid() && "InstructionsState is invalid.");
+ auto *I = dyn_cast<Instruction>(V);
+ if (!HasCopyables)
+ return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
+ doesNotNeedToBeScheduled(V);
+    // MainOp for copyables is always schedulable, to correctly identify
+    // non-schedulable copyables.
+ if (isCopyableElement(V)) {
+ auto IsNonSchedulableCopyableElement = [this](Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ return !I || isa<PHINode>(I) || I->getParent() != MainOp->getParent() ||
+ (doesNotNeedToBeScheduled(I) &&
+                // If the copyable instruction comes after MainOp
+                // (non-schedulable, but used in the block), it cannot be
+                // vectorized, as that may generate a use before def.
+ (isVectorLikeInstWithConstOps(I) || !MainOp->comesBefore(I)));
+ };
+
+ return IsNonSchedulableCopyableElement(V);
+ }
+ return !I || isa<PHINode>(I) || isVectorLikeInstWithConstOps(I) ||
+ doesNotNeedToBeScheduled(V);
+ }
+
+ bool areInstructionsWithCopyableElements() const {
+ assert(valid() && "InstructionsState is invalid.");
+ return HasCopyables;
+ }
};
std::pair<Instruction *, SmallVector<Value *>>
@@ -1917,6 +2019,7 @@ public:
CompressEntryToData.clear();
ExternalUses.clear();
ExternalUsesAsOriginalScalar.clear();
+ ExternalUsesWithNonUsers.clear();
for (auto &Iter : BlocksSchedules) {
BlockScheduling *BS = Iter.second.get();
BS->clear();
@@ -2899,9 +3002,6 @@ public:
for (OperandDataVec &Ops : OpsVec)
Ops.resize(NumLanes);
for (unsigned Lane : seq<unsigned>(NumLanes)) {
- Value *V = VL[Lane];
- assert((isa<Instruction>(V) || isa<PoisonValue>(V)) &&
- "Expected instruction or poison value");
// Our tree has just 3 nodes: the root and two operands.
// It is therefore trivial to get the APO. We only need to check the
// opcode of V and whether the operand at OpIdx is the LHS or RHS
@@ -2912,17 +3012,24 @@ public:
// Since operand reordering is performed on groups of commutative
// operations or alternating sequences (e.g., +, -), we can safely tell
// the inverse operations by checking commutativity.
- if (isa<PoisonValue>(V)) {
+ auto *I = dyn_cast<Instruction>(VL[Lane]);
+ if (!I && isa<PoisonValue>(VL[Lane])) {
for (unsigned OpIdx : seq<unsigned>(NumOperands))
OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
continue;
}
- auto [SelectedOp, Ops] = convertTo(cast<Instruction>(V), S);
- // We cannot check commutativity by the converted instruction
- // (SelectedOp) because isCommutative also examines def-use
- // relationships.
- bool IsInverseOperation =
- !isCommutative(SelectedOp, cast<Instruction>(V));
+ bool IsInverseOperation = false;
+ if (S.isCopyableElement(VL[Lane])) {
+ // The value is a copyable element.
+ IsInverseOperation = !isCommutative(MainOp);
+ } else {
+ assert(I && "Expected instruction");
+ auto [SelectedOp, Ops] = convertTo(I, S);
+ // We cannot check commutativity by the converted instruction
+ // (SelectedOp) because isCommutative also examines def-use
+ // relationships.
+ IsInverseOperation = !isCommutative(SelectedOp, I);
+ }
for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
bool APO = (OpIdx == 0) ? false : IsInverseOperation;
OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
@@ -3792,6 +3899,9 @@ private:
/// reordering of operands during buildTreeRec() and vectorizeTree().
SmallVector<ValueList, 2> Operands;
+ /// Copyable elements of the entry node.
+ SmallPtrSet<const Value *, 4> CopyableElements;
+
/// MainOp and AltOp are recorded inside. S should be obtained from
/// newTreeEntry.
InstructionsState S = InstructionsState::invalid();
@@ -3820,11 +3930,7 @@ private:
void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
/// Marks the node as one that does not require scheduling.
- void setDoesNotNeedToSchedule() {
- assert(::doesNotNeedToSchedule(Scalars) &&
- "Expected to not need scheduling");
- DoesNotNeedToSchedule = true;
- }
+ void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
/// Returns true if the node is marked as one that does not require
/// scheduling.
bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
@@ -3896,6 +4002,20 @@ private:
bool hasState() const { return S.valid(); }
+ /// Add \p V to the list of copyable elements.
+ void addCopyableElement(Value *V) {
+ assert(S.isCopyableElement(V) && "Not a copyable element.");
+ CopyableElements.insert(V);
+ }
+
+ /// Returns true if \p V is a copyable element.
+ bool isCopyableElement(Value *V) const {
+ return CopyableElements.contains(V);
+ }
+
+ /// Returns true if any scalar in the list is a copyable element.
+ bool hasCopyableElements() const { return !CopyableElements.empty(); }
+
/// When ReuseReorderShuffleIndices is empty it just returns position of \p
/// V within vector of Scalars. Otherwise, try to remap on its reuse index.
unsigned findLaneForValue(Value *V) const {
@@ -3968,6 +4088,8 @@ private:
for (Value *V : Scalars)
dbgs().indent(2) << *V << "\n";
dbgs() << "State: ";
+ if (S && hasCopyableElements())
+ dbgs() << "[[Copyable]] ";
switch (State) {
case Vectorize:
if (InterleaveFactor > 0) {
@@ -4145,12 +4267,20 @@ private:
}
}
} else if (!Last->isGather()) {
- if (doesNotNeedToSchedule(VL))
+ if (isa<PHINode>(S.getMainOp()) ||
+ isVectorLikeInstWithConstOps(S.getMainOp()) ||
+ (!S.areInstructionsWithCopyableElements() &&
+ doesNotNeedToSchedule(VL)) ||
+ all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
Last->setDoesNotNeedToSchedule();
SmallPtrSet<Value *, 4> Processed;
for (Value *V : VL) {
if (isa<PoisonValue>(V))
continue;
+ if (S.isCopyableElement(V)) {
+ Last->addCopyableElement(V);
+ continue;
+ }
auto It = ScalarToTreeEntries.find(V);
if (It == ScalarToTreeEntries.end()) {
ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
@@ -4162,16 +4292,14 @@ private:
}
}
// Update the scheduler bundle to point to this TreeEntry.
- assert((!Bundle.getBundle().empty() || isa<PHINode>(S.getMainOp()) ||
- isVectorLikeInstWithConstOps(S.getMainOp()) ||
- Last->doesNotNeedToSchedule()) &&
+ assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
"Bundle and VL out of sync");
if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
auto *BundleMember = Bundle.getBundle().begin();
SmallPtrSet<Value *, 4> Processed;
for (Value *V : VL) {
- if (doesNotNeedToBeScheduled(V) || !Processed.insert(V).second)
+ if (S.isNonSchedulable(V) || !Processed.insert(V).second)
continue;
++BundleMember;
}
@@ -4280,7 +4408,8 @@ private:
/// in general.
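+  /// If \p TryCopyableElementsVectorization is true, also tries to model
+  /// non-matching scalars as copyable elements.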
ScalarsVectorizationLegality
getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
- const EdgeInfo &UserTreeIdx) const;
+ const EdgeInfo &UserTreeIdx,
+ bool TryCopyableElementsVectorization) const;
/// Checks if the specified list of the instructions/values can be vectorized
/// and fills required data before actual scheduling of the instructions.
@@ -4420,6 +4549,10 @@ private:
/// extractelement instructions.
SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
+  /// A list of scalars to be extracted without a specific user because of too
+  /// many uses.
+ SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
+
/// Values used only by @llvm.assume calls.
SmallPtrSet<const Value *, 32> EphValues;
@@ -4996,7 +5129,8 @@ private:
/// Build a bundle from the ScheduleData nodes corresponding to the
/// scalar instruction for each lane.
- ScheduleBundle &buildBundle(ArrayRef<Value *> VL);
+ ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
+ const InstructionsState &S);
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
@@ -6727,7 +6861,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
return std::move(ResOrder);
}
if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
- (!TE.UserTreeIndex ||
+ (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
!Instruction::isBinaryOp(TE.UserTreeIndex.UserTE->getOpcode())) &&
(TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
return std::nullopt;
@@ -7038,10 +7172,11 @@ bool BoUpSLP::isProfitableToReorder() const {
VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
VectorizableTree.front()->ReorderIndices.empty()) {
// Check if the tree has only single store and single (unordered) load node,
- // other nodes are phis or geps/binops, combined with phis, and/orsingle
+ // other nodes are phis or geps/binops, combined with phis, and/or single
// gather load node
bool HasPhis = false;
- if (VectorizableTree.front()->getOpcode() == Instruction::PHI &&
+ if (VectorizableTree.front()->hasState() &&
+ VectorizableTree.front()->getOpcode() == Instruction::PHI &&
VectorizableTree.front()->Scalars.size() == TinyVF &&
VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
return false;
@@ -7049,6 +7184,8 @@ bool BoUpSLP::isProfitableToReorder() const {
unsigned GatherLoads = 0;
for (const std::unique_ptr<TreeEntry> &TE :
ArrayRef(VectorizableTree).drop_front()) {
+ if (TE->State == TreeEntry::SplitVectorize)
+ continue;
if (!TE->hasState()) {
if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
all_of(TE->Scalars, IsaPred<BinaryOperator, PHINode>))
@@ -7072,7 +7209,10 @@ bool BoUpSLP::isProfitableToReorder() const {
if (TE->getOpcode() == Instruction::GetElementPtr ||
Instruction::isBinaryOp(TE->getOpcode()))
continue;
- if (TE->getOpcode() != Instruction::PHI)
+ if (TE->getOpcode() != Instruction::PHI &&
+ (!TE->hasCopyableElements() ||
+ static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
+ TE->Scalars.size() / 2))
return true;
if (VectorizableTree.front()->Scalars.size() == TinyVF &&
TE->getNumOperands() > PhiOpsLimit)
@@ -7860,7 +8000,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
}
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
- if ((Entry.getOpcode() == Instruction::Store ||
+ if (Entry.hasState() &&
+ (Entry.getOpcode() == Instruction::Store ||
Entry.getOpcode() == Instruction::Load) &&
Entry.State == TreeEntry::StridedVectorize &&
!Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
@@ -7870,7 +8011,9 @@ Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
void BoUpSLP::buildExternalUses(
const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
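+  // Scalars with at least this many uses are conservatively extracted without
+  // recording a specific user.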
+ const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
DenseMap<Value *, unsigned> ScalarToExtUses;
+ SmallPtrSet<Value *, 4> ExternalUsers;
// Collect the values that we need to extract from the tree.
for (auto &TEPtr : VectorizableTree) {
TreeEntry *Entry = TEPtr.get();
@@ -7882,13 +8025,24 @@ void BoUpSLP::buildExternalUses(
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
- if (!isa<Instruction>(Scalar))
+ if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
continue;
+
// All uses must be replaced already? No need to do it again.
auto It = ScalarToExtUses.find(Scalar);
if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
continue;
+ if (Scalar->hasNUsesOrMore(NumVectScalars)) {
+ unsigned FoundLane = Entry->findLaneForValue(Scalar);
+ LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
+ << " from " << *Scalar << "for many users.\n");
+ It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
+ ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
+ ExternalUsesWithNonUsers.insert(Scalar);
+ continue;
+ }
+
// Check if the scalar is externally used as an extra arg.
const auto ExtI = ExternallyUsedValues.find(Scalar);
if (ExtI != ExternallyUsedValues.end()) {
@@ -7916,7 +8070,10 @@ void BoUpSLP::buildExternalUses(
// Some in-tree scalars will remain as scalar in vectorized
// instructions. If that is the case, the one in FoundLane will
// be used.
- if (all_of(UseEntries, [&](TreeEntry *UseEntry) {
+ if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
+ isa<LoadInst, StoreInst>(UserInst)) ||
+ isa<CallInst>(UserInst)) ||
+ all_of(UseEntries, [&](TreeEntry *UseEntry) {
return UseEntry->State == TreeEntry::ScatterVectorize ||
!doesInTreeUserNeedToExtract(
Scalar, getRootEntryInstruction(*UseEntry), TLI,
@@ -7946,6 +8103,7 @@ void BoUpSLP::buildExternalUses(
<< ".\n");
It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
+ ExternalUsesWithNonUsers.insert(Scalar);
if (!U)
break;
}
@@ -9612,7 +9770,8 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
PoisonValue::get(UniqueValues.front()->getType()));
// Check that extended with poisons operations are still valid for
// vectorization (div/rem are not allowed).
- if (!getSameOpcode(PaddedUniqueValues, TLI).valid()) {
+ if (!S.areInstructionsWithCopyableElements() &&
+ !getSameOpcode(PaddedUniqueValues, TLI).valid()) {
LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
ReuseShuffleIndices.clear();
return false;
@@ -9761,13 +9920,95 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
}
namespace {
-/// Class accepts incoming list of values and generates the list of values
-/// for scheduling and list of operands for the new nodes.
+/// Class accepts an incoming list of values, checks if it is able to model
+/// "copyable" values as compatible operations, and generates the list of
+/// values for scheduling and the list of operands for the new nodes.
class InstructionsCompatibilityAnalysis {
DominatorTree &DT;
const DataLayout &DL;
const TargetTransformInfo &TTI;
const TargetLibraryInfo &TLI;
+ unsigned MainOpcode = 0;
+ Instruction *MainOp = nullptr;
+
+  /// Identifies the best candidate value, which represents the main opcode
+  /// operation.
+  /// Currently the best candidate is the Add instruction in the parent block
+  /// with the highest DFS incoming number (the block that is dominated by the
+  /// others).
+ void findAndSetMainInstruction(ArrayRef<Value *> VL) {
+ BasicBlock *Parent = nullptr;
+ // Checks if the instruction has supported opcode.
+ auto IsSupportedOpcode = [](Instruction *I) {
+ return I && I->getOpcode() == Instruction::Add;
+ };
+ SmallDenseSet<Value *, 8> Operands;
+ for (Value *V : VL) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ continue;
+ if (!DT.isReachableFromEntry(I->getParent()))
+ continue;
+ if (!MainOp) {
+ MainOp = I;
+ Parent = I->getParent();
+ Operands.insert(I->op_begin(), I->op_end());
+ continue;
+ }
+ if (Parent == I->getParent()) {
+ if (!IsSupportedOpcode(MainOp))
+ MainOp = I;
+ if (MainOp->getOpcode() == I->getOpcode() &&
+ doesNotNeedToBeScheduled(MainOp) && !doesNotNeedToBeScheduled(I))
+ MainOp = I;
+ Operands.insert(I->op_begin(), I->op_end());
+ continue;
+ }
+ auto *NodeA = DT.getNode(Parent);
+ auto *NodeB = DT.getNode(I->getParent());
+ assert(NodeA && "Should only process reachable instructions");
+ assert(NodeB && "Should only process reachable instructions");
+ assert((NodeA == NodeB) ==
+ (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
+ "Different nodes should have different DFS numbers");
+ if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
+ MainOp = I;
+ Parent = I->getParent();
+ Operands.clear();
+ Operands.insert(I->op_begin(), I->op_end());
+ }
+ }
+ if (!IsSupportedOpcode(MainOp) || Operands.contains(MainOp)) {
+ MainOp = nullptr;
+ return;
+ }
+ MainOpcode = MainOp->getOpcode();
+ }
+
+ /// Returns the idempotent value for the \p MainOp with the detected \p
+ /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
+ /// the operand itself, since V or V == V.
+ Value *selectBestIdempotentValue() const {
+ assert(MainOpcode == Instruction::Add && "Unsupported opcode");
+ return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
+ !MainOp->isCommutative());
+ }
+
+  /// Returns the value and operands for \p V, considering whether it is an
+  /// original instruction, whose actual operands should be returned, or a
+  /// copyable element, which should be represented as an idempotent
+  /// instruction.
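+  /// E.g. for MainOpcode == Add, a copyable element V is modeled as
+  /// "add V, 0" and yields the operands {V, 0}.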
+ SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
+ if (isa<PoisonValue>(V))
+ return {V, V};
+ if (!S.isCopyableElement(V))
+ return convertTo(cast<Instruction>(V), S).second;
+ switch (MainOpcode) {
+ case Instruction::Add:
+ return {V, selectBestIdempotentValue()};
+ default:
+ break;
+ }
+ llvm_unreachable("Unsupported opcode");
+ }
/// Builds operands for the original instructions.
void
@@ -9928,22 +10169,165 @@ public:
const TargetLibraryInfo &TLI)
: DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
+ InstructionsState
+ buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
+ bool TryCopyableElementsVectorization,
+ bool WithProfitabilityCheck = false,
+ bool SkipSameCodeCheck = false) {
+ InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
+ ? InstructionsState::invalid()
+ : getSameOpcode(VL, TLI);
+ if (S)
+ return S;
+ if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
+ return S;
+ findAndSetMainInstruction(VL);
+ if (!MainOp)
+ return InstructionsState::invalid();
+ S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
+  // TODO: Remove this check once support for schedulable copyables is landed.
+ if (any_of(VL, [&](Value *V) {
+ return S.isCopyableElement(V) && !S.isNonSchedulable(V);
+ }))
+ return InstructionsState::invalid();
+
+ if (!WithProfitabilityCheck)
+ return S;
+ // Check if it is profitable to vectorize the instruction.
+ SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
+ auto BuildCandidates =
+ [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
+ Value *V2) {
+ if (V1 != V2 && isa<PHINode>(V1))
+ return;
+ auto *I1 = dyn_cast<Instruction>(V1);
+ auto *I2 = dyn_cast<Instruction>(V2);
+ if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
+ I1->getParent() != I2->getParent())
+ return;
+ Candidates.emplace_back(V1, (I1 || I2) ? V2 : V1);
+ };
+ if (VL.size() == 2) {
+ // Check if the operands allow better vectorization.
+ SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
+ BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
+ BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
+ bool Res = !Candidates1.empty() && !Candidates2.empty() &&
+ R.findBestRootPair(Candidates1) &&
+ R.findBestRootPair(Candidates2);
+ if (!Res && isCommutative(MainOp)) {
+ Candidates1.clear();
+ Candidates2.clear();
+ BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
+ BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
+ Res = !Candidates1.empty() && !Candidates2.empty() &&
+ R.findBestRootPair(Candidates1) &&
+ R.findBestRootPair(Candidates2);
+ }
+ if (!Res)
+ return InstructionsState::invalid();
+ return S;
+ }
+ assert(Operands.size() == 2 && "Unexpected number of operands!");
+ unsigned CopyableNum =
+ count_if(VL, [&](Value *V) { return S.isCopyableElement(V); });
+ if (CopyableNum < VL.size() / 2)
+ return S;
+ // Too many phi copyables - exit.
+ const unsigned Limit = VL.size() / 24;
+ if ((CopyableNum >= VL.size() - Limit ||
+ (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
+ CopyableNum >= MaxPHINumOperands) &&
+ all_of(VL, [&](Value *V) {
+ return isa<PHINode>(V) || !S.isCopyableElement(V);
+ }))
+ return InstructionsState::invalid();
+ // Check profitability if number of copyables > VL.size() / 2.
+ // 1. Reorder operands for better matching.
+ if (isCommutative(MainOp)) {
+ for (auto &Ops : Operands) {
+ // Make instructions the first operands.
+ if (!isa<Instruction>(Ops.front()) && isa<Instruction>(Ops.back())) {
+ std::swap(Ops.front(), Ops.back());
+ continue;
+ }
+ // Make constants the second operands.
+ if (isa<Constant>(Ops.front())) {
+ std::swap(Ops.front(), Ops.back());
+ continue;
+ }
+ }
+ }
+    // 2. Check if operands can be vectorized.
+ if (count_if(Operands.back(), IsaPred<Instruction>) > 1)
+ return InstructionsState::invalid();
+ auto CheckOperand = [&](ArrayRef<Value *> Ops) {
+ if (allConstant(Ops) || isSplat(Ops))
+ return true;
+ // Check if it is "almost" splat, i.e. has >= 4 elements and only single
+ // one is different.
+ constexpr unsigned Limit = 4;
+ if (Operands.front().size() >= Limit) {
+ SmallDenseMap<const Value *, unsigned> Counters;
+ for (Value *V : Ops) {
+ if (isa<UndefValue>(V))
+ continue;
+ ++Counters[V];
+ }
+ if (Counters.size() == 2 &&
+ any_of(Counters, [&](const std::pair<const Value *, unsigned> &C) {
+ return C.second == 1;
+ }))
+ return true;
+ }
+ // First operand not a constant or splat? Last attempt - check for
+ // potential vectorization.
+ InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
+ InstructionsState OpS = Analysis.buildInstructionsState(
+ Ops, R, /*TryCopyableElementsVectorization=*/true);
+ if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(Ops)))
+ return false;
+ unsigned CopyableNum =
+ count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); });
+ return CopyableNum <= VL.size() / 2;
+ };
+ if (!CheckOperand(Operands.front()))
+ return InstructionsState::invalid();
+
+ return S;
+ }
+
SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
ArrayRef<Value *> VL) {
assert(S && "Invalid state!");
SmallVector<BoUpSLP::ValueList> Operands;
- buildOriginalOperands(S, VL, Operands);
+ if (S.areInstructionsWithCopyableElements()) {
+ MainOp = S.getMainOp();
+ MainOpcode = S.getOpcode();
+ Operands.assign(MainOp->getNumOperands(),
+ BoUpSLP::ValueList(VL.size(), nullptr));
+ for (auto [Idx, V] : enumerate(VL)) {
+ SmallVector<Value *> OperandsForValue = getOperands(S, V);
+ for (auto [OperandIdx, Operand] : enumerate(OperandsForValue))
+ Operands[OperandIdx][Idx] = Operand;
+ }
+ } else {
+ buildOriginalOperands(S, VL, Operands);
+ }
return Operands;
}
};
} // namespace
-BoUpSLP::ScalarsVectorizationLegality
-BoUpSLP::getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
- const EdgeInfo &UserTreeIdx) const {
+BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
+ ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
+ bool TryCopyableElementsVectorization) const {
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
- InstructionsState S = getSameOpcode(VL, *TLI);
+ InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
+ InstructionsState S = Analysis.buildInstructionsState(
+ VL, *this, TryCopyableElementsVectorization,
+ /*WithProfitabilityCheck=*/true, TryCopyableElementsVectorization);
// Don't go into catchswitch blocks, which can happen with PHIs.
// Such blocks can only have PHIs and the catchswitch. There is no
@@ -10066,7 +10450,7 @@ BoUpSLP::getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
bool IsScatterVectorizeUserTE =
UserTreeIdx.UserTE &&
UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
- bool AreAllSameBlock = S && allSameBlock(VL);
+ bool AreAllSameBlock = S.valid();
bool AreScatterAllGEPSameBlock =
(IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
VL.size() > 2 &&
@@ -10091,12 +10475,18 @@ BoUpSLP::getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
NotProfitableForVectorization(VL)) {
if (!S) {
LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
- "C,S,B,O, small shuffle. \n");
+ "C,S,B,O, small shuffle. \n";
+ dbgs() << "[";
+ interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
+ dbgs() << "]\n");
return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
/*TryToFindDuplicates=*/true,
/*TrySplitVectorize=*/true);
}
- LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
+ dbgs() << "[";
+ interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
+ dbgs() << "]\n");
return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
}
@@ -10242,9 +10632,29 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
return true;
};
- ScalarsVectorizationLegality Legality =
- getScalarsVectorizationLegality(VL, Depth, UserTreeIdx);
- const InstructionsState &S = Legality.getInstructionsState();
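+  // Bundles that contain only constants mixed with PHIs (and poison) cannot
+  // form a vectorizable operation - gather them directly.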
+ auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
+ bool AreConsts = false;
+ for (Value *V : VL) {
+ if (isa<PoisonValue>(V))
+ continue;
+ if (isa<Constant>(V)) {
+ AreConsts = true;
+ continue;
+ }
+ if (!isa<PHINode>(V))
+ return false;
+ }
+ return AreConsts;
+ };
+ if (AreOnlyConstsWithPHIs(VL)) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
+ newGatherTreeEntry(VL, InstructionsState::invalid(), UserTreeIdx);
+ return;
+ }
+
+ ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
+ VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
+ InstructionsState S = Legality.getInstructionsState();
if (!Legality.isLegal()) {
if (Legality.trySplitVectorize()) {
auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
@@ -10252,11 +10662,18 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
return;
}
- if (Legality.tryToFindDuplicates())
- tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx);
+ if (!S)
+ Legality = getScalarsVectorizationLegality(
+ VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
+ if (!Legality.isLegal()) {
+ if (Legality.tryToFindDuplicates())
+ tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
+ UserTreeIdx);
- newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
- return;
+ newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
+ return;
+ }
+ S = Legality.getInstructionsState();
}
// FIXME: investigate if there are profitable cases for VL.size() <= 4.
@@ -13024,7 +13441,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
assert(E->getOpcode() &&
((allSameType(VL) && allSameBlock(VL)) ||
(E->getOpcode() == Instruction::GetElementPtr &&
- E->getMainOp()->getType()->isPointerTy())) &&
+ E->getMainOp()->getType()->isPointerTy()) ||
+ E->hasCopyableElements()) &&
"Invalid VL");
Instruction *VL0 = E->getMainOp();
unsigned ShuffleOrOp =
@@ -13036,6 +13454,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
SmallBitVector UsedScalars(Sz, false);
for (unsigned I = 0; I < Sz; ++I) {
if (isa<Instruction>(UniqueValues[I]) &&
+ !E->isCopyableElement(UniqueValues[I]) &&
getTreeEntries(UniqueValues[I]).front() == E)
continue;
UsedScalars.set(I);
@@ -14075,15 +14494,45 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
// If the tree contains only phis, buildvectors, split nodes and
// small nodes with reuses, we can skip it.
+ SmallVector<const TreeEntry *> StoreLoadNodes;
+ unsigned NumGathers = 0;
+ constexpr int LimitTreeSize = 36;
if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
- all_of(VectorizableTree, [](const std::unique_ptr<TreeEntry> &TE) {
- return TE->State == TreeEntry::SplitVectorize ||
- (TE->isGather() &&
- none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
- (TE->hasState() && (TE->getOpcode() == Instruction::PHI ||
- (!TE->ReuseShuffleIndices.empty() &&
- TE->Scalars.size() == 2)));
- }))
+ all_of(VectorizableTree,
+ [&](const std::unique_ptr<TreeEntry> &TE) {
+ if (!TE->isGather() && TE->hasState() &&
+ (TE->getOpcode() == Instruction::Load ||
+ TE->getOpcode() == Instruction::Store)) {
+ StoreLoadNodes.push_back(TE.get());
+ return true;
+ }
+ if (TE->isGather())
+ ++NumGathers;
+ return TE->State == TreeEntry::SplitVectorize ||
+ (TE->Idx == 0 && TE->Scalars.size() == 2 &&
+ TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
+ VectorizableTree.size() > LimitTreeSize) ||
+ (TE->isGather() &&
+ none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
+ (TE->hasState() &&
+ (TE->getOpcode() == Instruction::PHI ||
+ (TE->hasCopyableElements() &&
+ static_cast<unsigned>(count_if(
+ TE->Scalars, IsaPred<PHINode, Constant>)) >=
+ TE->Scalars.size() / 2) ||
+ ((!TE->ReuseShuffleIndices.empty() ||
+ !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
+ TE->Scalars.size() == 2)));
+ }) &&
+ (StoreLoadNodes.empty() ||
+ (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
+ (NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
+ return TE->getOpcode() == Instruction::Store ||
+ all_of(TE->Scalars, [&](Value *V) {
+ return !isa<LoadInst>(V) ||
+ areAllUsersVectorized(cast<Instruction>(V));
+ });
+ })))))
return true;
// We can vectorize the tree if its size is greater than or equal to the
@@ -14826,6 +15275,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
bool IsProfitablePHIUser =
(KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
VectorizableTree.front()->Scalars.size() > 2)) &&
+ VectorizableTree.front()->hasState() &&
VectorizableTree.front()->getOpcode() == Instruction::PHI &&
!Inst->hasNUsesOrMore(UsesLimit) &&
none_of(Inst->users(),
@@ -15276,7 +15726,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
const BasicBlock *TEInsertBlock = nullptr;
// Main node of PHI entries keeps the correct order of operands/incoming
// blocks.
- if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp());
+ if (auto *PHI = dyn_cast_or_null<PHINode>(
+ TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
TEInsertPt = TEInsertBlock->getTerminator();
@@ -15375,7 +15826,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
"Expected only single user of a gather node.");
const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
- PHINode *UserPHI = UseEI.UserTE->State != TreeEntry::SplitVectorize
+ PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
+ UseEI.UserTE->hasState())
? dyn_cast<PHINode>(UseEI.UserTE->getMainOp())
: nullptr;
Instruction *InsertPt =
@@ -15388,7 +15840,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
TEUseEI.UserTE->isAltShuffle()) &&
all_of(TEUseEI.UserTE->Scalars, isUsedOutsideBlock)) {
if (UseEI.UserTE->State != TreeEntry::Vectorize ||
- (UseEI.UserTE->getOpcode() == Instruction::PHI &&
+ (UseEI.UserTE->hasState() &&
+ UseEI.UserTE->getOpcode() == Instruction::PHI &&
!UseEI.UserTE->isAltShuffle()) ||
!all_of(UseEI.UserTE->Scalars, isUsedOutsideBlock))
continue;
@@ -16009,25 +16462,32 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
Instruction *Res = nullptr;
// Get the basic block this bundle is in. All instructions in the bundle
// should be in this block (except for extractelement-like instructions with
- // constant indices or gathered loads).
- auto *Front = E->getMainOp();
+ // constant indices or gathered loads or copyables).
+ Instruction *Front;
+ unsigned Opcode;
+ if (E->hasState()) {
+ Front = E->getMainOp();
+ Opcode = E->getOpcode();
+ } else {
+ Front = cast<Instruction>(*find_if(E->Scalars, IsaPred<Instruction>));
+ Opcode = Front->getOpcode();
+ }
auto *BB = Front->getParent();
- assert(((GatheredLoadsEntriesFirst.has_value() &&
- E->getOpcode() == Instruction::Load && E->isGather() &&
- E->Idx < *GatheredLoadsEntriesFirst) ||
- E->State == TreeEntry::SplitVectorize ||
- all_of(E->Scalars,
- [=](Value *V) -> bool {
- if (E->getOpcode() == Instruction::GetElementPtr &&
- !isa<GetElementPtrInst>(V))
- return true;
- auto *I = dyn_cast<Instruction>(V);
- return !I || !E->getMatchingMainOpOrAltOp(I) ||
- I->getParent() == BB ||
- isVectorLikeInstWithConstOps(I);
- })) &&
- "Expected gathered loads or GEPs or instructions from same basic "
- "block.");
+ assert(
+ ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
+ E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
+ E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
+ all_of(E->Scalars,
+ [=](Value *V) -> bool {
+ if (Opcode == Instruction::GetElementPtr &&
+ !isa<GetElementPtrInst>(V))
+ return true;
+ auto *I = dyn_cast<Instruction>(V);
+ return !I || !E->getMatchingMainOpOrAltOp(I) ||
+ I->getParent() == BB || isVectorLikeInstWithConstOps(I);
+ })) &&
+ "Expected gathered loads or GEPs or instructions from same basic "
+ "block.");
auto FindLastInst = [&]() {
Instruction *LastInst = Front;
@@ -16035,18 +16495,20 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
auto *I = dyn_cast<Instruction>(V);
if (!I)
continue;
+ if (E->isCopyableElement(I))
+ continue;
if (LastInst->getParent() == I->getParent()) {
if (LastInst->comesBefore(I))
LastInst = I;
continue;
}
- assert(((E->getOpcode() == Instruction::GetElementPtr &&
+ assert(((Opcode == Instruction::GetElementPtr &&
!isa<GetElementPtrInst>(I)) ||
E->State == TreeEntry::SplitVectorize ||
(isVectorLikeInstWithConstOps(LastInst) &&
isVectorLikeInstWithConstOps(I)) ||
(GatheredLoadsEntriesFirst.has_value() &&
- E->getOpcode() == Instruction::Load && E->isGather() &&
+ Opcode == Instruction::Load && E->isGather() &&
E->Idx < *GatheredLoadsEntriesFirst)) &&
"Expected vector-like or non-GEP in GEP node insts only.");
if (!DT->isReachableFromEntry(LastInst->getParent())) {
@@ -16075,16 +16537,18 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
auto *I = dyn_cast<Instruction>(V);
if (!I)
continue;
+ if (E->isCopyableElement(I))
+ continue;
if (FirstInst->getParent() == I->getParent()) {
if (I->comesBefore(FirstInst))
FirstInst = I;
continue;
}
- assert(((E->getOpcode() == Instruction::GetElementPtr &&
- !isa<GetElementPtrInst>(I)) ||
- (isVectorLikeInstWithConstOps(FirstInst) &&
- isVectorLikeInstWithConstOps(I))) &&
- "Expected vector-like or non-GEP in GEP node insts only.");
+ assert(((Opcode == Instruction::GetElementPtr &&
+ !isa<GetElementPtrInst>(I)) ||
+ (isVectorLikeInstWithConstOps(FirstInst) &&
+ isVectorLikeInstWithConstOps(I))) &&
+ "Expected vector-like or non-GEP in GEP node insts only.");
if (!DT->isReachableFromEntry(FirstInst->getParent())) {
FirstInst = I;
continue;
@@ -16122,7 +16586,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
// Set insertpoint for gathered loads to the very first load.
if (GatheredLoadsEntriesFirst.has_value() &&
E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
- E->getOpcode() == Instruction::Load) {
+ Opcode == Instruction::Load) {
Res = FindFirstInst();
EntryToLastInstruction.try_emplace(E, Res);
return *Res;
@@ -16139,7 +16603,8 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
return nullptr;
for (Value *V : E->Scalars) {
auto *I = dyn_cast<Instruction>(V);
- if (!I || isa<PHINode>(I) || doesNotNeedToBeScheduled(I))
+ if (!I || isa<PHINode>(I) ||
+ (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I)))
continue;
ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(I);
if (Bundles.empty())
@@ -16153,13 +16618,13 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
};
const ScheduleBundle *Bundle = FindScheduleBundle(E);
if (!E->isGather() && !Bundle) {
- if ((E->getOpcode() == Instruction::GetElementPtr &&
+ if ((Opcode == Instruction::GetElementPtr &&
any_of(E->Scalars,
[](Value *V) {
return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
})) ||
- all_of(E->Scalars, [](Value *V) {
- return isa<PoisonValue>(V) ||
+ all_of(E->Scalars, [&](Value *V) {
+ return isa<PoisonValue>(V) || E->isCopyableElement(V) ||
(!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
}))
Res = FindLastInst();
@@ -18640,6 +19105,7 @@ Value *BoUpSLP::vectorizeTree(
TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
(TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
TE->UserTreeIndex.UserTE->isAltShuffle()) &&
+ !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
all_of(TE->UserTreeIndex.UserTE->Scalars,
[](Value *V) { return isUsedOutsideBlock(V); })) {
Instruction &LastInst =
@@ -18903,7 +19369,7 @@ Value *BoUpSLP::vectorizeTree(
continue;
assert(
(ExternallyUsedValues.count(Scalar) ||
- Scalar->hasNUsesOrMore(UsesLimit) ||
+ ExternalUsesWithNonUsers.count(Scalar) ||
ExternalUsesAsOriginalScalar.contains(Scalar) ||
any_of(
Scalar->users(),
@@ -19182,7 +19648,7 @@ Value *BoUpSLP::vectorizeTree(
if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
EE && IgnoredExtracts.contains(EE))
continue;
- if (isa<PoisonValue>(Scalar))
+ if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
continue;
#ifndef NDEBUG
Type *Ty = Scalar->getType();
@@ -19424,12 +19890,15 @@ void BoUpSLP::optimizeGatherSequence() {
}
BoUpSLP::ScheduleBundle &
-BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
+BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL,
+ const InstructionsState &S) {
auto &BundlePtr =
ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>());
for (Value *V : VL) {
if (doesNotNeedToBeScheduled(V))
continue;
+ if (S.isCopyableElement(V))
+ continue;
ScheduleData *BundleMember = getScheduleData(V);
assert(BundleMember && "no ScheduleData for bundle member "
"(maybe not in same basic block)");
@@ -19450,10 +19919,19 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
const InstructionsState &S) {
// No need to schedule PHIs, insertelement, extractelement and extractvalue
// instructions.
+ bool HasCopyables = S.areInstructionsWithCopyableElements();
if (isa<PHINode>(S.getMainOp()) ||
- isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
+ isVectorLikeInstWithConstOps(S.getMainOp()) ||
+ (!HasCopyables && doesNotNeedToSchedule(VL)) ||
+ all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))
return nullptr;
+  // TODO: Remove once full support for copyables is landed.
+ assert(all_of(VL,
+ [&](Value *V) {
+ return !S.isCopyableElement(V) || S.isNonSchedulable(V);
+ }) &&
+ "Copyable elements should not be schedulable");
// Initialize the instruction bundle.
Instruction *OldScheduleEnd = ScheduleEnd;
LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
@@ -19499,7 +19977,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
// Make sure that the scheduling region contains all
// instructions of the bundle.
for (Value *V : VL) {
- if (doesNotNeedToBeScheduled(V))
+ if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V))
continue;
if (!extendSchedulingRegion(V, S)) {
// If the scheduling region got new instructions at the lower end (or it
@@ -19516,7 +19994,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
bool ReSchedule = false;
for (Value *V : VL) {
- if (doesNotNeedToBeScheduled(V))
+ if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V))
continue;
ScheduleData *BundleMember = getScheduleData(V);
assert(BundleMember &&
@@ -19541,7 +20019,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
ReSchedule = true;
}
- ScheduleBundle &Bundle = buildBundle(VL);
+ ScheduleBundle &Bundle = buildBundle(VL, S);
TryScheduleBundleImpl(ReSchedule, Bundle);
if (!Bundle.isReady()) {
for (ScheduleData *BD : Bundle.getBundle()) {
@@ -19558,7 +20036,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
}
ScheduledBundlesList.pop_back();
for (Value *V : VL) {
- if (doesNotNeedToBeScheduled(V))
+ if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V))
continue;
ScheduledBundles.find(cast<Instruction>(V))->getSecond().pop_back();
}
@@ -20187,7 +20665,7 @@ bool BoUpSLP::collectValuesToDemote(
};
if (E.isGather() || !Visited.insert(&E).second ||
any_of(E.Scalars, [&](Value *V) {
- return !isa<PoisonValue>(V) && all_of(V->users(), [&](User *U) {
+ return !isa<Constant>(V) && all_of(V->users(), [&](User *U) {
return isa<InsertElementInst>(U) && !isVectorized(U);
});
}))
@@ -20555,9 +21033,10 @@ void BoUpSLP::computeMinimumValueSizes() {
if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
SelectInst>(U) ||
isa<SIToFPInst, UIToFPInst>(U) ||
- !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
- SelectInst>(UserTE->getMainOp()) ||
- isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))
+ (UserTE->hasState() &&
+ (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
+ SelectInst>(UserTE->getMainOp()) ||
+ isa<SIToFPInst, UIToFPInst>(UserTE->getMainOp()))))
return true;
unsigned UserTESz = DL->getTypeSizeInBits(
UserTE->Scalars.front()->getType());
@@ -20653,7 +21132,12 @@ void BoUpSLP::computeMinimumValueSizes() {
if (!IsKnownPositive)
++BitWidth1;
- APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
+ auto *I = dyn_cast<Instruction>(Root);
+ if (!I) {
+ MaxBitWidth = std::max(BitWidth1, MaxBitWidth);
+ continue;
+ }
+ APInt Mask = DB->getDemandedBits(I);
unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
MaxBitWidth =
std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
@@ -20802,6 +21286,7 @@ void BoUpSLP::computeMinimumValueSizes() {
NodeIdx < VectorizableTree.size() &&
VectorizableTree[NodeIdx]->UserTreeIndex &&
VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
+ VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
Instruction::Trunc &&
!VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
@@ -20982,7 +21467,9 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
for (Value *V : Chain)
ValOps.insert(cast<StoreInst>(V)->getValueOperand());
// Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
- InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
+ InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
+ InstructionsState S = Analysis.buildInstructionsState(
+ ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
DenseSet<Value *> Stores(Chain.begin(), Chain.end());
bool IsAllowedSize =
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 40a5565..25b9616 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -962,7 +962,11 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
BackedgeTakenCount->setUnderlyingValue(TCMO);
}
- VectorTripCount.setUnderlyingValue(VectorTripCountV);
+ if (!VectorTripCount.getUnderlyingValue())
+ VectorTripCount.setUnderlyingValue(VectorTripCountV);
+ else
+ assert(VectorTripCount.getUnderlyingValue() == VectorTripCountV &&
+ "VectorTripCount set earlier must much VectorTripCountV");
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
// FIXME: Model VF * UF computation completely in VPlan.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 9b67b7d..a5de593 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -906,10 +906,10 @@ template <unsigned PartOpIdx> class LLVM_ABI_FOR_TEST VPUnrollPartAccessor {
protected:
/// Return the VPValue operand containing the unroll part or null if there is
/// no such operand.
- VPValue *getUnrollPartOperand(VPUser &U) const;
+ VPValue *getUnrollPartOperand(const VPUser &U) const;
/// Return the unroll part.
- unsigned getUnrollPart(VPUser &U) const;
+ unsigned getUnrollPart(const VPUser &U) const;
};
/// Helper to manage IR metadata for recipes. It filters out metadata that
@@ -1012,6 +1012,10 @@ public:
ReductionStartVector,
// Creates a step vector starting from 0 to VF with a step of 1.
StepVector,
+ /// Extracts a single lane (first operand) from a set of vector operands.
+ /// The lane specifies an index into a vector formed by combining all vector
+ /// operands (all operands after the first one).
+ ExtractLane,
};
@@ -1837,6 +1841,10 @@ public:
getGEPNoWrapFlags(), getDebugLoc());
}
+ /// Return true if this VPVectorPointerRecipe corresponds to part 0. Note that
+ /// this is only accurate after the VPlan has been unrolled.
+ bool isFirstPart() const { return getUnrollPart(*this) == 0; }
+
/// Return the cost of this VPHeaderPHIRecipe.
InstructionCost computeCost(ElementCount VF,
VPCostContext &Ctx) const override {
@@ -2304,14 +2312,15 @@ public:
/// respective masks, ordered [I0, M0, I1, M1, I2, M2, ...]. Note that M0 can
/// be omitted (implied by passing an odd number of operands) in which case
/// all other incoming values are merged into it.
- VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands)
- : VPSingleDefRecipe(VPDef::VPBlendSC, Operands, Phi, Phi->getDebugLoc()) {
+ VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands, DebugLoc DL)
+ : VPSingleDefRecipe(VPDef::VPBlendSC, Operands, Phi, DL) {
assert(Operands.size() > 0 && "Expected at least one operand!");
}
VPBlendRecipe *clone() override {
SmallVector<VPValue *> Ops(operands());
- return new VPBlendRecipe(cast<PHINode>(getUnderlyingValue()), Ops);
+ return new VPBlendRecipe(cast_or_null<PHINode>(getUnderlyingValue()), Ops,
+ getDebugLoc());
}
VP_CLASSOF_IMPL(VPDef::VPBlendSC)
@@ -3486,7 +3495,7 @@ public:
/// Return true if this VPScalarIVStepsRecipe corresponds to part 0. Note that
/// this is only accurate after the VPlan has been unrolled.
- bool isPart0() { return getUnrollPart(*this) == 0; }
+ bool isPart0() const { return getUnrollPart(*this) == 0; }
VP_CLASSOF_IMPL(VPDef::VPScalarIVStepsSC)
@@ -4056,6 +4065,10 @@ public:
/// Returns VF * UF of the vector loop region.
VPValue &getVFxUF() { return VFxUF; }
+ LLVMContext &getContext() const {
+ return getScalarHeader()->getIRBasicBlock()->getContext();
+ }
+
void addVF(ElementCount VF) { VFs.insert(VF); }
void setVF(ElementCount VF) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 3499e65..16072f2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -110,6 +110,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::BuildStructVector:
case VPInstruction::BuildVector:
return SetResultTyFromOp();
+ case VPInstruction::ExtractLane:
+ return inferScalarType(R->getOperand(1));
case VPInstruction::FirstActiveLane:
return Type::getIntNTy(Ctx, 64);
case VPInstruction::ExtractLastElement:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 194874a..6c1f53b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -437,9 +437,12 @@ static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB,
// We are about to replace the branch to exit the region. Remove the original
// BranchOnCond, if there is any.
+ DebugLoc LatchDL = DL;
if (!LatchVPBB->empty() &&
- match(&LatchVPBB->back(), m_BranchOnCond(m_VPValue())))
+ match(&LatchVPBB->back(), m_BranchOnCond(m_VPValue()))) {
+ LatchDL = LatchVPBB->getTerminator()->getDebugLoc();
LatchVPBB->getTerminator()->eraseFromParent();
+ }
VPBuilder Builder(LatchVPBB);
// Add a VPInstruction to increment the scalar canonical IV by VF * UF.
@@ -452,7 +455,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB,
// Add the BranchOnCount VPInstruction to the latch.
Builder.createNaryOp(VPInstruction::BranchOnCount,
- {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
+ {CanonicalIVIncrement, &Plan.getVectorTripCount()},
+ LatchDL);
}
void VPlanTransforms::prepareForVectorization(
@@ -462,28 +466,27 @@ void VPlanTransforms::prepareForVectorization(
VPDominatorTree VPDT;
VPDT.recalculate(Plan);
- VPBlockBase *HeaderVPB = Plan.getEntry()->getSingleSuccessor();
- canonicalHeaderAndLatch(HeaderVPB, VPDT);
- VPBlockBase *LatchVPB = HeaderVPB->getPredecessors()[1];
+ auto *HeaderVPBB = cast<VPBasicBlock>(Plan.getEntry()->getSingleSuccessor());
+ canonicalHeaderAndLatch(HeaderVPBB, VPDT);
+ auto *LatchVPBB = cast<VPBasicBlock>(HeaderVPBB->getPredecessors()[1]);
VPBasicBlock *VecPreheader = Plan.createVPBasicBlock("vector.ph");
VPBlockUtils::insertBlockAfter(VecPreheader, Plan.getEntry());
VPBasicBlock *MiddleVPBB = Plan.createVPBasicBlock("middle.block");
- // The canonical LatchVPB has the header block as last successor. If it has
+ // The canonical LatchVPBB has the header block as last successor. If it has
// another successor, this successor is an exit block - insert middle block on
// its edge. Otherwise, add middle block as another successor retaining header
// as last.
- if (LatchVPB->getNumSuccessors() == 2) {
- VPBlockBase *LatchExitVPB = LatchVPB->getSuccessors()[0];
- VPBlockUtils::insertOnEdge(LatchVPB, LatchExitVPB, MiddleVPBB);
+ if (LatchVPBB->getNumSuccessors() == 2) {
+ VPBlockBase *LatchExitVPB = LatchVPBB->getSuccessors()[0];
+ VPBlockUtils::insertOnEdge(LatchVPBB, LatchExitVPB, MiddleVPBB);
} else {
- VPBlockUtils::connectBlocks(LatchVPB, MiddleVPBB);
- LatchVPB->swapSuccessors();
+ VPBlockUtils::connectBlocks(LatchVPBB, MiddleVPBB);
+ LatchVPBB->swapSuccessors();
}
- addCanonicalIVRecipes(Plan, cast<VPBasicBlock>(HeaderVPB),
- cast<VPBasicBlock>(LatchVPB), InductionTy, IVDL);
+ addCanonicalIVRecipes(Plan, HeaderVPBB, LatchVPBB, InductionTy, IVDL);
[[maybe_unused]] bool HandledUncountableEarlyExit = false;
// Disconnect all early exits from the loop leaving it with a single exit from
@@ -499,8 +502,7 @@ void VPlanTransforms::prepareForVectorization(
assert(!HandledUncountableEarlyExit &&
"can handle exactly one uncountable early exit");
handleUncountableEarlyExit(cast<VPBasicBlock>(Pred), EB, Plan,
- cast<VPBasicBlock>(HeaderVPB),
- cast<VPBasicBlock>(LatchVPB), Range);
+ HeaderVPBB, LatchVPBB, Range);
HandledUncountableEarlyExit = true;
} else {
for (VPRecipeBase &R : EB->phis())
@@ -564,15 +566,15 @@ void VPlanTransforms::prepareForVectorization(
// the corresponding compare because they may have ended up with different
// line numbers and we want to avoid awkward line stepping while debugging.
// E.g., if the compare has got a line number inside the loop.
- DebugLoc LatchDL = TheLoop->getLoopLatch()->getTerminator()->getDebugLoc();
+ DebugLoc LatchDL = LatchVPBB->getTerminator()->getDebugLoc();
VPBuilder Builder(MiddleVPBB);
VPValue *Cmp;
if (!RequiresScalarEpilogueCheck)
- Cmp = Plan.getOrAddLiveIn(ConstantInt::getFalse(
- IntegerType::getInt1Ty(TripCount->getType()->getContext())));
+ Cmp = Plan.getOrAddLiveIn(
+ ConstantInt::getFalse(IntegerType::getInt1Ty(Plan.getContext())));
else if (TailFolded)
- Cmp = Plan.getOrAddLiveIn(ConstantInt::getTrue(
- IntegerType::getInt1Ty(TripCount->getType()->getContext())));
+ Cmp = Plan.getOrAddLiveIn(
+ ConstantInt::getTrue(IntegerType::getInt1Ty(Plan.getContext())));
else
Cmp = Builder.createICmp(CmpInst::ICMP_EQ, Plan.getTripCount(),
&Plan.getVectorTripCount(), LatchDL, "cmp.n");
@@ -646,7 +648,7 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
.createNaryOp(VPInstruction::BranchOnCond, {CondVPV},
Plan.getCanonicalIV()->getDebugLoc());
if (AddBranchWeights) {
- MDBuilder MDB(Plan.getScalarHeader()->getIRBasicBlock()->getContext());
+ MDBuilder MDB(Plan.getContext());
MDNode *BranchWeights =
MDB.createBranchWeights(CheckBypassWeights, /*IsExpected=*/false);
Term->addMetadata(LLVMContext::MD_prof, BranchWeights);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index f0cab79..3b3bbc3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -184,8 +184,7 @@ void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) {
VPValue *Cond = SI->getOperand(0);
VPBasicBlock *DefaultDst = cast<VPBasicBlock>(Src->getSuccessors()[0]);
MapVector<VPBasicBlock *, SmallVector<VPValue *>> Dst2Compares;
- for (const auto &[Idx, Succ] :
- enumerate(ArrayRef(Src->getSuccessors()).drop_front())) {
+ for (const auto &[Idx, Succ] : enumerate(drop_begin(Src->getSuccessors()))) {
VPBasicBlock *Dst = cast<VPBasicBlock>(Succ);
assert(!getEdgeMask(Src, Dst) && "Edge masks already created");
// Cases whose destination is the same as default are redundant and can
@@ -206,7 +205,7 @@ void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) {
// cases with destination == Dst are taken. Join the conditions for each
// case whose destination == Dst using an OR.
VPValue *Mask = Conds[0];
- for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front())
+ for (VPValue *V : drop_begin(Conds))
Mask = Builder.createOr(Mask, V);
if (SrcMask)
Mask = Builder.createLogicalAnd(SrcMask, Mask);
@@ -252,8 +251,9 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
}
OperandsWithMask.push_back(EdgeMask);
}
- PHINode *IRPhi = cast<PHINode>(PhiR->getUnderlyingValue());
- auto *Blend = new VPBlendRecipe(IRPhi, OperandsWithMask);
+ PHINode *IRPhi = cast_or_null<PHINode>(PhiR->getUnderlyingValue());
+ auto *Blend =
+ new VPBlendRecipe(IRPhi, OperandsWithMask, PhiR->getDebugLoc());
Builder.insert(Blend);
PhiR->replaceAllUsesWith(Blend);
PhiR->eraseFromParent();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index b2066ce..68e7c20 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -413,20 +413,21 @@ void VPSingleDefRecipe::dump() const { VPDef::dump(); }
template <unsigned PartOpIdx>
VPValue *
-VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const {
+VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(const VPUser &U) const {
if (U.getNumOperands() == PartOpIdx + 1)
return U.getOperand(PartOpIdx);
return nullptr;
}
template <unsigned PartOpIdx>
-unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const {
+unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(const VPUser &U) const {
if (auto *UnrollPartOp = getUnrollPartOperand(U))
return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue();
return 0;
}
namespace llvm {
+template class VPUnrollPartAccessor<1>;
template class VPUnrollPartAccessor<2>;
template class VPUnrollPartAccessor<3>;
}
@@ -863,6 +864,31 @@ Value *VPInstruction::generate(VPTransformState &State) {
Res = Builder.CreateOr(Res, State.get(Op));
return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
}
+ case VPInstruction::ExtractLane: {
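+    // Operand 0 is the lane to extract; operands 1..N are the per-part
+    // vectors. Compute each part's starting lane and use a chain of
+    // (icmp uge, select) to pick the element from the covering part.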
+    Value *LaneToExtract = State.get(getOperand(0), /*NeedsScalar=*/true);
+ Type *IdxTy = State.TypeAnalysis.inferScalarType(getOperand(0));
+ Value *Res = nullptr;
+ Value *RuntimeVF = getRuntimeVF(State.Builder, IdxTy, State.VF);
+
+ for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
+ Value *VectorStart =
+ Builder.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Idx - 1));
+ Value *VectorIdx = Idx == 1
+ ? LaneToExtract
+ : Builder.CreateSub(LaneToExtract, VectorStart);
+ Value *Ext = State.VF.isScalar()
+ ? State.get(getOperand(Idx))
+ : Builder.CreateExtractElement(
+ State.get(getOperand(Idx)), VectorIdx);
+ if (Res) {
+ Value *Cmp = Builder.CreateICmpUGE(LaneToExtract, VectorStart);
+ Res = Builder.CreateSelect(Cmp, Ext, Res);
+ } else {
+ Res = Ext;
+ }
+ }
+ return Res;
+ }
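To make the ExtractLane lowering above concrete: operand 0 is the lane index into the logical concatenation of the vector operands, one per unrolled part (the VPlanUnroll.cpp hunk further below appends those per-part operands). A standalone model of the generated sub/icmp-uge/select chain, in plain C++ with illustrative names and a fixed VF assumed:

    #include <cassert>
    #include <vector>

    // Parts holds the per-part vectors (each of VF elements) and Lane
    // indexes their logical concatenation.
    int extractLane(unsigned Lane,
                    const std::vector<std::vector<int>> &Parts) {
      unsigned VF = Parts[0].size();
      assert(Lane < VF * Parts.size() && "lane out of range");
      // Part 0 is extracted unconditionally; in the IR an out-of-range index
      // yields poison that a later select discards, so clamp in the model.
      int Res = Parts[0][Lane < VF ? Lane : 0];
      for (unsigned Part = 1; Part != Parts.size(); ++Part) {
        unsigned VectorStart = VF * Part;
        if (Lane >= VectorStart) { // models the icmp uge feeding the select
          unsigned Idx = Lane - VectorStart;
          Res = Parts[Part][Idx < VF ? Idx : 0]; // clamp dead-part reads too
        }
      }
      return Res;
    }

With VF = 4 and two parts, Lane = 5 satisfies Lane >= VectorStart only for part 1 (VectorStart = 4), so element 1 of part 1 is returned.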
case VPInstruction::FirstActiveLane: {
if (getNumOperands() == 1) {
Value *Mask = State.get(getOperand(0));
@@ -921,7 +947,8 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}
switch (getOpcode()) {
- case Instruction::ExtractElement: {
+ case Instruction::ExtractElement:
+ case VPInstruction::ExtractLane: {
// Add on the cost of extracting the element.
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
@@ -983,6 +1010,7 @@ bool VPInstruction::isVectorToScalar() const {
return getOpcode() == VPInstruction::ExtractLastElement ||
getOpcode() == VPInstruction::ExtractPenultimateElement ||
getOpcode() == Instruction::ExtractElement ||
+ getOpcode() == VPInstruction::ExtractLane ||
getOpcode() == VPInstruction::FirstActiveLane ||
getOpcode() == VPInstruction::ComputeAnyOfResult ||
getOpcode() == VPInstruction::ComputeFindIVResult ||
@@ -1048,6 +1076,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::BuildVector:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
+ case VPInstruction::ExtractLane:
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::FirstActiveLane:
@@ -1097,6 +1126,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
case VPInstruction::ComputeAnyOfResult:
case VPInstruction::ComputeFindIVResult:
return Op == getOperand(1);
+ case VPInstruction::ExtractLane:
+ return Op == getOperand(0);
};
llvm_unreachable("switch should return");
}
@@ -1176,6 +1207,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::BuildVector:
O << "buildvector";
break;
+ case VPInstruction::ExtractLane:
+ O << "extract-lane";
+ break;
case VPInstruction::ExtractLastElement:
O << "extract-last-element";
break;
@@ -3357,12 +3391,7 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
// must use intrinsics to interleave.
if (VecTy->isScalableTy()) {
assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
- VectorType *InterleaveTy =
- VectorType::get(VecTy->getElementType(),
- VecTy->getElementCount().multiplyCoefficientBy(Factor));
- return Builder.CreateIntrinsic(InterleaveTy,
- getInterleaveIntrinsicID(Factor), Vals,
- /*FMFSource=*/nullptr, Name);
+ return Builder.CreateVectorInterleave(Vals, Name);
}
// Fixed length. Start by concatenating all vectors into a wide vector.
@@ -3469,8 +3498,8 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(InterleaveFactor <= 8 &&
"Unsupported deinterleave factor for scalable vectors");
NewLoad = State.Builder.CreateIntrinsic(
- getDeinterleaveIntrinsicID(InterleaveFactor), NewLoad->getType(),
- NewLoad,
+ Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
+ NewLoad->getType(), NewLoad,
/*FMFSource=*/nullptr, "strided.vec");
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 3372bcc..8de05c1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -774,10 +774,10 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
using namespace VPlanPatternMatch;
VPValue *Incoming, *Mask;
- if (!match(Op, m_VPInstruction<Instruction::ExtractElement>(
- m_VPValue(Incoming),
+ if (!match(Op, m_VPInstruction<VPInstruction::ExtractLane>(
m_VPInstruction<VPInstruction::FirstActiveLane>(
- m_VPValue(Mask)))))
+ m_VPValue(Mask)),
+ m_VPValue(Incoming))))
return nullptr;
auto *WideIV = getOptimizableIVOf(Incoming);
@@ -1094,6 +1094,10 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
return Def->replaceAllUsesWith(A);
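+  // x * 0 -> 0: replace the multiply with its zero operand.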
+ if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(0))))
+ return Def->replaceAllUsesWith(R.getOperand(0) == A ? R.getOperand(1)
+ : R.getOperand(0));
+
if (match(Def, m_Not(m_VPValue(A)))) {
if (match(A, m_Not(m_VPValue(A))))
return Def->replaceAllUsesWith(A);
@@ -1172,6 +1176,14 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
if (!Plan->isUnrolled())
return;
+  // VPVectorPointer recipes for part 0 can be replaced by their start
+  // pointer.
+ if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(&R)) {
+ if (VecPtr->isFirstPart()) {
+ VecPtr->replaceAllUsesWith(VecPtr->getOperand(0));
+ return;
+ }
+ }
+
// VPScalarIVSteps for part 0 can be replaced by their start value, if only
// the first lane is demanded.
if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
@@ -1307,8 +1319,9 @@ static void simplifyBlends(VPlan &Plan) {
OperandsWithMask.push_back(Blend->getMask(I));
}
- auto *NewBlend = new VPBlendRecipe(
- cast<PHINode>(Blend->getUnderlyingValue()), OperandsWithMask);
+ auto *NewBlend =
+ new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
+ OperandsWithMask, Blend->getDebugLoc());
NewBlend->insertBefore(&R);
VPValue *DeadMask = Blend->getMask(StartIndex);
@@ -1361,7 +1374,7 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
unsigned NewBitWidth =
ComputeBitWidth(TC->getValue(), BestVF.getKnownMinValue() * BestUF);
- LLVMContext &Ctx = Plan.getCanonicalIV()->getScalarType()->getContext();
+ LLVMContext &Ctx = Plan.getContext();
auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
bool MadeChange = false;
@@ -1883,9 +1896,7 @@ void VPlanTransforms::truncateToMinimalBitwidths(
}
}
-/// Remove BranchOnCond recipes with true or false conditions together with
-/// removing dead edges to their successors.
-static void removeBranchOnConst(VPlan &Plan) {
+void VPlanTransforms::removeBranchOnConst(VPlan &Plan) {
using namespace llvm::VPlanPatternMatch;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getEntry()))) {
@@ -1908,12 +1919,9 @@ static void removeBranchOnConst(VPlan &Plan) {
"There must be a single edge between VPBB and its successor");
// Values coming from VPBB into phi recipes of RemoveSucc are removed from
// these recipes.
- for (VPRecipeBase &R : RemovedSucc->phis()) {
- auto *Phi = cast<VPPhiAccessors>(&R);
- assert((!isa<VPIRPhi>(&R) || RemovedSucc->getNumPredecessors() == 1) &&
- "VPIRPhis must have a single predecessor");
- Phi->removeIncomingValueFor(VPBB);
- }
+ for (VPRecipeBase &R : RemovedSucc->phis())
+ cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
+
// Disconnect blocks and remove the terminator. RemovedSucc will be deleted
// automatically on VPlan destruction if it becomes unreachable.
VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
@@ -2515,8 +2523,8 @@ void VPlanTransforms::createInterleaveGroups(
DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
IG->getIndex(IRInsertPos),
/*IsSigned=*/true);
- VPValue *OffsetVPV = Plan.getOrAddLiveIn(
- ConstantInt::get(IRInsertPos->getParent()->getContext(), -Offset));
+ VPValue *OffsetVPV =
+ Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset));
VPBuilder B(InsertPos);
Addr = InBounds ? B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV)
: B.createPtrAdd(InsertPos->getAddr(), OffsetVPV);
@@ -2842,7 +2850,7 @@ void VPlanTransforms::handleUncountableEarlyExit(
VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,
"first.active.lane");
IncomingFromEarlyExit = EarlyExitB.createNaryOp(
- Instruction::ExtractElement, {IncomingFromEarlyExit, FirstActiveLane},
+ VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
nullptr, "early.exit.value");
ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
}
@@ -3093,6 +3101,29 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
}
}
+void VPlanTransforms::materializeVectorTripCount(
+ VPlan &Plan, ElementCount BestVF, unsigned BestUF,
+ PredicatedScalarEvolution &PSE) {
+ assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
+ assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
+
+ VPValue *TC = Plan.getTripCount();
+ // Skip cases for which the trip count may be non-trivial to materialize.
+ if (!Plan.hasScalarTail() ||
+ Plan.getMiddleBlock()->getSingleSuccessor() ==
+ Plan.getScalarPreheader() ||
+ !TC->isLiveIn())
+ return;
+  // Materialize the vector trip count early if the original trip count is
+  // constant and it can simply be computed as
+  // (Original TC / (VF * UF)) * (VF * UF).
+ ScalarEvolution &SE = *PSE.getSE();
+ auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
+ const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
+  const SCEV *VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
+ if (auto *NewC = dyn_cast<SCEVConstant>(VecTCScev))
+ Plan.getVectorTripCount().setUnderlyingValue(NewC->getValue());
+}
+
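A worked instance of the fold above, assuming the original trip count is a SCEVConstant and VF is fixed (for scalable VF, VF * UF is not a compile-time constant, so the SCEVConstant check fails and nothing is materialized): the vector trip count is the trip count rounded down to a multiple of VF * UF. A plain-C++ sketch with illustrative names:

    #include <cassert>
    #include <cstdint>

    // (TC udiv (VF * UF)) * (VF * UF): round TC down to a multiple of
    // VF * UF, matching the SCEV expression built above.
    uint64_t constantVectorTripCount(uint64_t TC, uint64_t VF, uint64_t UF) {
      uint64_t VFxUF = VF * UF;
      return TC / VFxUF * VFxUF;
    }

    int main() {
      // TC = 1003, VF = 4, UF = 2: the vector loop covers 1000 iterations
      // and the remaining 3 run in the scalar tail.
      assert(constantVectorTripCount(1003, 4, 2) == 1000);
      return 0;
    }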
/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
@@ -3350,7 +3381,7 @@ void VPlanTransforms::addBranchWeightToMiddleTerminator(
if (VF.isScalable() && VScaleForTuning.has_value())
VectorStep *= *VScaleForTuning;
assert(VectorStep > 0 && "trip count should not be zero");
- MDBuilder MDB(Plan.getScalarHeader()->getIRBasicBlock()->getContext());
+ MDBuilder MDB(Plan.getContext());
MDNode *BranchWeights =
MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
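On the weights set in this hunk: {1, VectorStep - 1} encodes the expectation of one middle-block exit per VectorStep original-loop iterations, with VectorStep scaled by the tuning vscale for scalable VF as the lines above show. A tiny sketch, assuming VectorStep has already been computed as in the surrounding code:

    #include <cassert>
    #include <cstdint>
    #include <utility>

    // Weights {1, VectorStep - 1}: one middle-block exit is expected per
    // VectorStep original-loop iterations.
    std::pair<uint64_t, uint64_t> middleBranchWeights(uint64_t VectorStep) {
      assert(VectorStep > 0 && "trip count should not be zero");
      return {1, VectorStep - 1};
    }

For example, a VectorStep of 16 yields weights {1, 15}.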
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index ab189f6..d5af6cd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -224,6 +224,10 @@ struct VPlanTransforms {
/// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis.
static void simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy);
+ /// Remove BranchOnCond recipes with true or false conditions together with
+ /// removing dead edges to their successors.
+ static void removeBranchOnConst(VPlan &Plan);
+
/// If there's a single exit block, optimize its phi recipes that use exiting
/// IV values by feeding them precomputed end values instead, possibly taken
/// one step backwards.
@@ -234,6 +238,12 @@ struct VPlanTransforms {
/// Add explicit broadcasts for live-ins and VPValues defined in \p Plan's entry block if they are used as vectors.
static void materializeBroadcasts(VPlan &Plan);
+  /// Materialize vector trip counts for constant trip counts early if they
+  /// can simply be computed as (Original TC / (VF * UF)) * (VF * UF).
+ static void materializeVectorTripCount(VPlan &Plan, ElementCount BestVF,
+ unsigned BestUF,
+ PredicatedScalarEvolution &PSE);
+
/// Try to convert a plan with interleave groups with VF elements to a plan
/// with the interleave groups replaced by wide loads and stores processing VF
/// elements, if all transformed interleave groups access the full vector
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index b89cd21..871e37e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -363,6 +363,13 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
continue;
}
VPValue *Op0;
+ if (match(&R, m_VPInstruction<VPInstruction::ExtractLane>(
+ m_VPValue(Op0), m_VPValue(Op1)))) {
+ addUniformForAllParts(cast<VPInstruction>(&R));
+ for (unsigned Part = 1; Part != UF; ++Part)
+ R.addOperand(getValueForPart(Op1, Part));
+ continue;
+ }
if (match(&R, m_VPInstruction<VPInstruction::ExtractLastElement>(
m_VPValue(Op0))) ||
match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 82adc34..6252f4f 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -3174,6 +3174,55 @@ bool VectorCombine::foldCastFromReductions(Instruction &I) {
return true;
}
+/// Returns true if this ShuffleVectorInst eventually feeds into a
+/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
+/// chains of shuffles and binary operators (in any combination/order).
+/// The search gives up once more than MaxVisited instructions have been
+/// visited.
+static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
+ constexpr unsigned MaxVisited = 32;
+ SmallPtrSet<Instruction *, 8> Visited;
+ SmallVector<Instruction *, 4> WorkList;
+ bool FoundReduction = false;
+
+ WorkList.push_back(SVI);
+ while (!WorkList.empty()) {
+ Instruction *I = WorkList.pop_back_val();
+ for (User *U : I->users()) {
+      auto *UI = dyn_cast<Instruction>(U);
+      if (!UI || !Visited.insert(UI).second)
+ continue;
+ if (Visited.size() > MaxVisited)
+ return false;
+ if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
+ // More than one reduction reached
+ if (FoundReduction)
+ return false;
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::vector_reduce_add:
+ case Intrinsic::vector_reduce_mul:
+ case Intrinsic::vector_reduce_and:
+ case Intrinsic::vector_reduce_or:
+ case Intrinsic::vector_reduce_xor:
+ case Intrinsic::vector_reduce_smin:
+ case Intrinsic::vector_reduce_smax:
+ case Intrinsic::vector_reduce_umin:
+ case Intrinsic::vector_reduce_umax:
+ FoundReduction = true;
+ continue;
+ default:
+ return false;
+ }
+ }
+
+ if (!isa<BinaryOperator>(UI) && !isa<ShuffleVectorInst>(UI))
+ return false;
+
+ WorkList.emplace_back(UI);
+ }
+ }
+ return FoundReduction;
+}
+
/// This method looks for groups of shuffles acting on binops, of the form:
/// %x = shuffle ...
/// %y = shuffle ...
@@ -3416,6 +3465,65 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
};
+ unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
+ unsigned MaxVectorSize =
+ TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
+ unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
+ // When there are multiple shufflevector operations on the same input,
+ // especially when the vector length is larger than the register size,
+ // identical shuffle patterns may occur across different groups of elements.
+ // To avoid overestimating the cost by counting these repeated shuffles more
+ // than once, we only account for unique shuffle patterns. This adjustment
+ // prevents inflated costs in the cost model for wide vectors split into
+ // several register-sized groups.
+ std::set<SmallVector<int, 4>> UniqueShuffles;
+ auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
+ // Compute the cost for performing the shuffle over the full vector.
+ auto ShuffleCost =
+ TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
+ unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
+ if (NumFullVectors < 2)
+ return C + ShuffleCost;
+ SmallVector<int, 4> SubShuffle(MaxElementsInVector);
+ unsigned NumUniqueGroups = 0;
+  // For each group of MaxElementsInVector contiguous elements, collect the
+  // group's shuffle pattern and insert it into the set of unique patterns.
+  for (unsigned I = 0; I < NumFullVectors; ++I) {
+    for (unsigned J = 0; J < MaxElementsInVector; ++J)
+      SubShuffle[J] = Mask[MaxElementsInVector * I + J];
+    if (UniqueShuffles.insert(SubShuffle).second)
+      NumUniqueGroups += 1;
+  }
+  return C + ShuffleCost * NumUniqueGroups / NumFullVectors;
+ };
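A concrete instance of the de-duplication above: a 16-lane mask on 4-element registers (MaxElementsInVector = 4) splits into four groups; if all four groups encode the same 4-lane pattern, the shuffle is charged once, not four times. A standalone sketch of the same arithmetic in plain C++ (illustrative names; the real lambda threads an accumulated InstructionCost through instead of returning the scaled cost):

    #include <set>
    #include <vector>

    // Scale a full-width shuffle cost by the fraction of register-sized
    // mask groups that are unique, mirroring the integer division above.
    unsigned adjustedShuffleCost(unsigned FullCost,
                                 const std::vector<int> &Mask,
                                 unsigned MaxElementsInVector) {
      unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
      if (NumFullVectors < 2)
        return FullCost;
      std::set<std::vector<int>> Unique;
      for (unsigned I = 0; I < NumFullVectors; ++I)
        Unique.emplace(Mask.begin() + I * MaxElementsInVector,
                       Mask.begin() + (I + 1) * MaxElementsInVector);
      return FullCost * static_cast<unsigned>(Unique.size()) / NumFullVectors;
    }

With integer division, four identical groups scale the cost by 1/4, matching the NumUniqueGroups / NumFullVectors factor in the lambda.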
+ auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
+ auto *SV = dyn_cast<ShuffleVectorInst>(I);
+ if (!SV)
+ return C;
+ SmallVector<int, 16> Mask;
+ SV->getShuffleMask(Mask);
+ return AddShuffleMaskAdjustedCost(C, Mask);
+ };
+  // Check that the input shuffles all operate on the same two source
+  // operands.
+ auto AllShufflesHaveSameOperands =
+ [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
+ if (InputShuffles.size() < 2)
+ return false;
+ ShuffleVectorInst *FirstSV =
+ dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
+ if (!FirstSV)
+ return false;
+
+ Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
+ return std::all_of(
+ std::next(InputShuffles.begin()), InputShuffles.end(),
+ [&](Instruction *I) {
+ ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
+ return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
+ });
+ };
+
// Get the costs of the shuffles + binops before and after with the new
// shuffle masks.
InstructionCost CostBefore =
@@ -3423,8 +3531,14 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
InstructionCost(0), AddShuffleCost);
- CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
- InstructionCost(0), AddShuffleCost);
+ if (AllShufflesHaveSameOperands(InputShuffles)) {
+ UniqueShuffles.clear();
+ CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
+ InstructionCost(0), AddShuffleAdjustedCost);
+ } else {
+ CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
+ InstructionCost(0), AddShuffleCost);
+ }
// The new binops will be unused for lanes past the used shuffle lengths.
// These types attempt to get the correct cost for that from the target.
@@ -3435,8 +3549,9 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
InstructionCost CostAfter =
TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
+ UniqueShuffles.clear();
CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
- InstructionCost(0), AddShuffleMaskCost);
+ InstructionCost(0), AddShuffleMaskAdjustedCost);
std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
CostAfter +=
std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
@@ -3445,7 +3560,8 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
<< " vs CostAfter: " << CostAfter << "\n");
- if (CostBefore <= CostAfter)
+ if (CostBefore < CostAfter ||
+ (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
return false;
// The cost model has passed, create the new instructions.
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-fcmp.ll b/llvm/test/Analysis/CostModel/AArch64/sve-fcmp.ll
index 9801d14..388362f 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-fcmp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-fcmp.ll
@@ -3,619 +3,619 @@
define void @fcmp_oeq(i32 %arg) {
; CHECK-LABEL: 'fcmp_oeq'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp oeq <2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp oeq <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp oeq <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp oeq <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp oeq <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp oeq <2 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp oeq <4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp oeq <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp oeq <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp oeq <vscale x 2 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp oeq <vscale x 4 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp oeq <vscale x 8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp oeq <vscale x 2 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp oeq <vscale x 4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp oeq <vscale x 2 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp oeq <vscale x 4 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp oeq <vscale x 8 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp oeq <vscale x 16 x half> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2f32 = fcmp oeq <2 x float> undef, undef
- %v4f32 = fcmp oeq <4 x float> undef, undef
- %v8f32 = fcmp oeq <8 x float> undef, undef
- %v2f64 = fcmp oeq <2 x double> undef, undef
- %v4f64 = fcmp oeq <4 x double> undef, undef
- %v2f16 = fcmp oeq <2 x half> undef, undef
- %v4f16 = fcmp oeq <4 x half> undef, undef
- %v8f16 = fcmp oeq <8 x half> undef, undef
- %v16f16 = fcmp oeq <16 x half> undef, undef
+ %v2f32 = fcmp oeq <vscale x 2 x float> undef, undef
+ %v4f32 = fcmp oeq <vscale x 4 x float> undef, undef
+ %v8f32 = fcmp oeq <vscale x 8 x float> undef, undef
+ %v2f64 = fcmp oeq <vscale x 2 x double> undef, undef
+ %v4f64 = fcmp oeq <vscale x 4 x double> undef, undef
+ %v2f16 = fcmp oeq <vscale x 2 x half> undef, undef
+ %v4f16 = fcmp oeq <vscale x 4 x half> undef, undef
+ %v8f16 = fcmp oeq <vscale x 8 x half> undef, undef
+ %v16f16 = fcmp oeq <vscale x 16 x half> undef, undef
ret void
}
define void @fcmp_oeq_bfloat(i32 %arg) {
; CHECK-LABEL: 'fcmp_oeq_bfloat'
-; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp oeq <2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp oeq <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp oeq <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2bf16 = fcmp oeq <vscale x 2 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4bf16 = fcmp oeq <vscale x 4 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8bf16 = fcmp oeq <vscale x 8 x bfloat> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2bf16 = fcmp oeq <2 x bfloat> undef, undef
- %v4bf16 = fcmp oeq <4 x bfloat> undef, undef
- %v8bf16 = fcmp oeq <8 x bfloat> undef, undef
+ %v2bf16 = fcmp oeq <vscale x 2 x bfloat> undef, undef
+ %v4bf16 = fcmp oeq <vscale x 4 x bfloat> undef, undef
+ %v8bf16 = fcmp oeq <vscale x 8 x bfloat> undef, undef
ret void
}
define void @fcmp_ogt(i32 %arg) {
; CHECK-LABEL: 'fcmp_ogt'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ogt <2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ogt <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ogt <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ogt <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ogt <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ogt <2 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ogt <4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ogt <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ogt <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ogt <vscale x 2 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ogt <vscale x 4 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ogt <vscale x 8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ogt <vscale x 2 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ogt <vscale x 4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ogt <vscale x 2 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ogt <vscale x 4 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ogt <vscale x 8 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ogt <vscale x 16 x half> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2f32 = fcmp ogt <2 x float> undef, undef
- %v4f32 = fcmp ogt <4 x float> undef, undef
- %v8f32 = fcmp ogt <8 x float> undef, undef
- %v2f64 = fcmp ogt <2 x double> undef, undef
- %v4f64 = fcmp ogt <4 x double> undef, undef
- %v2f16 = fcmp ogt <2 x half> undef, undef
- %v4f16 = fcmp ogt <4 x half> undef, undef
- %v8f16 = fcmp ogt <8 x half> undef, undef
- %v16f16 = fcmp ogt <16 x half> undef, undef
+ %v2f32 = fcmp ogt <vscale x 2 x float> undef, undef
+ %v4f32 = fcmp ogt <vscale x 4 x float> undef, undef
+ %v8f32 = fcmp ogt <vscale x 8 x float> undef, undef
+ %v2f64 = fcmp ogt <vscale x 2 x double> undef, undef
+ %v4f64 = fcmp ogt <vscale x 4 x double> undef, undef
+ %v2f16 = fcmp ogt <vscale x 2 x half> undef, undef
+ %v4f16 = fcmp ogt <vscale x 4 x half> undef, undef
+ %v8f16 = fcmp ogt <vscale x 8 x half> undef, undef
+ %v16f16 = fcmp ogt <vscale x 16 x half> undef, undef
ret void
}
define void @fcmp_ogt_bfloat(i32 %arg) {
; CHECK-LABEL: 'fcmp_ogt_bfloat'
-; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp ogt <2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp ogt <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ogt <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2bf16 = fcmp ogt <vscale x 2 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4bf16 = fcmp ogt <vscale x 4 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8bf16 = fcmp ogt <vscale x 8 x bfloat> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2bf16 = fcmp ogt <2 x bfloat> undef, undef
- %v4bf16 = fcmp ogt <4 x bfloat> undef, undef
- %v8bf16 = fcmp ogt <8 x bfloat> undef, undef
+ %v2bf16 = fcmp ogt <vscale x 2 x bfloat> undef, undef
+ %v4bf16 = fcmp ogt <vscale x 4 x bfloat> undef, undef
+ %v8bf16 = fcmp ogt <vscale x 8 x bfloat> undef, undef
ret void
}
define void @fcmp_oge(i32 %arg) {
; CHECK-LABEL: 'fcmp_oge'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp oge <2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp oge <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp oge <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp oge <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp oge <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp oge <2 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp oge <4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp oge <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp oge <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp oge <vscale x 2 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp oge <vscale x 4 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp oge <vscale x 8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp oge <vscale x 2 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp oge <vscale x 4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp oge <vscale x 2 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp oge <vscale x 4 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp oge <vscale x 8 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp oge <vscale x 16 x half> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2f32 = fcmp oge <2 x float> undef, undef
- %v4f32 = fcmp oge <4 x float> undef, undef
- %v8f32 = fcmp oge <8 x float> undef, undef
- %v2f64 = fcmp oge <2 x double> undef, undef
- %v4f64 = fcmp oge <4 x double> undef, undef
- %v2f16 = fcmp oge <2 x half> undef, undef
- %v4f16 = fcmp oge <4 x half> undef, undef
- %v8f16 = fcmp oge <8 x half> undef, undef
- %v16f16 = fcmp oge <16 x half> undef, undef
+ %v2f32 = fcmp oge <vscale x 2 x float> undef, undef
+ %v4f32 = fcmp oge <vscale x 4 x float> undef, undef
+ %v8f32 = fcmp oge <vscale x 8 x float> undef, undef
+ %v2f64 = fcmp oge <vscale x 2 x double> undef, undef
+ %v4f64 = fcmp oge <vscale x 4 x double> undef, undef
+ %v2f16 = fcmp oge <vscale x 2 x half> undef, undef
+ %v4f16 = fcmp oge <vscale x 4 x half> undef, undef
+ %v8f16 = fcmp oge <vscale x 8 x half> undef, undef
+ %v16f16 = fcmp oge <vscale x 16 x half> undef, undef
ret void
}
define void @fcmp_oge_bfloat(i32 %arg) {
; CHECK-LABEL: 'fcmp_oge_bfloat'
-; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp oge <2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp oge <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp oge <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2bf16 = fcmp oge <vscale x 2 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4bf16 = fcmp oge <vscale x 4 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8bf16 = fcmp oge <vscale x 8 x bfloat> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2bf16 = fcmp oge <2 x bfloat> undef, undef
- %v4bf16 = fcmp oge <4 x bfloat> undef, undef
- %v8bf16 = fcmp oge <8 x bfloat> undef, undef
+ %v2bf16 = fcmp oge <vscale x 2 x bfloat> undef, undef
+ %v4bf16 = fcmp oge <vscale x 4 x bfloat> undef, undef
+ %v8bf16 = fcmp oge <vscale x 8 x bfloat> undef, undef
ret void
}
define void @fcmp_olt(i32 %arg) {
; CHECK-LABEL: 'fcmp_olt'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp olt <2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp olt <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp olt <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp olt <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp olt <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp olt <2 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp olt <4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp olt <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp olt <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp olt <vscale x 2 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp olt <vscale x 4 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp olt <vscale x 8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp olt <vscale x 2 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp olt <vscale x 4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp olt <vscale x 2 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp olt <vscale x 4 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp olt <vscale x 8 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp olt <vscale x 16 x half> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2f32 = fcmp olt <2 x float> undef, undef
- %v4f32 = fcmp olt <4 x float> undef, undef
- %v8f32 = fcmp olt <8 x float> undef, undef
- %v2f64 = fcmp olt <2 x double> undef, undef
- %v4f64 = fcmp olt <4 x double> undef, undef
- %v2f16 = fcmp olt <2 x half> undef, undef
- %v4f16 = fcmp olt <4 x half> undef, undef
- %v8f16 = fcmp olt <8 x half> undef, undef
- %v16f16 = fcmp olt <16 x half> undef, undef
+ %v2f32 = fcmp olt <vscale x 2 x float> undef, undef
+ %v4f32 = fcmp olt <vscale x 4 x float> undef, undef
+ %v8f32 = fcmp olt <vscale x 8 x float> undef, undef
+ %v2f64 = fcmp olt <vscale x 2 x double> undef, undef
+ %v4f64 = fcmp olt <vscale x 4 x double> undef, undef
+ %v2f16 = fcmp olt <vscale x 2 x half> undef, undef
+ %v4f16 = fcmp olt <vscale x 4 x half> undef, undef
+ %v8f16 = fcmp olt <vscale x 8 x half> undef, undef
+ %v16f16 = fcmp olt <vscale x 16 x half> undef, undef
ret void
}
define void @fcmp_olt_bfloat(i32 %arg) {
; CHECK-LABEL: 'fcmp_olt_bfloat'
-; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp olt <2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp olt <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp olt <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2bf16 = fcmp olt <vscale x 2 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4bf16 = fcmp olt <vscale x 4 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8bf16 = fcmp olt <vscale x 8 x bfloat> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2bf16 = fcmp olt <2 x bfloat> undef, undef
- %v4bf16 = fcmp olt <4 x bfloat> undef, undef
- %v8bf16 = fcmp olt <8 x bfloat> undef, undef
+ %v2bf16 = fcmp olt <vscale x 2 x bfloat> undef, undef
+ %v4bf16 = fcmp olt <vscale x 4 x bfloat> undef, undef
+ %v8bf16 = fcmp olt <vscale x 8 x bfloat> undef, undef
ret void
}
define void @fcmp_ole(i32 %arg) {
; CHECK-LABEL: 'fcmp_ole'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ole <2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ole <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ole <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ole <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ole <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ole <2 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ole <4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ole <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ole <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ole <vscale x 2 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ole <vscale x 4 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ole <vscale x 8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ole <vscale x 2 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ole <vscale x 4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ole <vscale x 2 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ole <vscale x 4 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ole <vscale x 8 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ole <vscale x 16 x half> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2f32 = fcmp ole <2 x float> undef, undef
- %v4f32 = fcmp ole <4 x float> undef, undef
- %v8f32 = fcmp ole <8 x float> undef, undef
- %v2f64 = fcmp ole <2 x double> undef, undef
- %v4f64 = fcmp ole <4 x double> undef, undef
- %v2f16 = fcmp ole <2 x half> undef, undef
- %v4f16 = fcmp ole <4 x half> undef, undef
- %v8f16 = fcmp ole <8 x half> undef, undef
- %v16f16 = fcmp ole <16 x half> undef, undef
+ %v2f32 = fcmp ole <vscale x 2 x float> undef, undef
+ %v4f32 = fcmp ole <vscale x 4 x float> undef, undef
+ %v8f32 = fcmp ole <vscale x 8 x float> undef, undef
+ %v2f64 = fcmp ole <vscale x 2 x double> undef, undef
+ %v4f64 = fcmp ole <vscale x 4 x double> undef, undef
+ %v2f16 = fcmp ole <vscale x 2 x half> undef, undef
+ %v4f16 = fcmp ole <vscale x 4 x half> undef, undef
+ %v8f16 = fcmp ole <vscale x 8 x half> undef, undef
+ %v16f16 = fcmp ole <vscale x 16 x half> undef, undef
ret void
}
define void @fcmp_ole_bfloat(i32 %arg) {
; CHECK-LABEL: 'fcmp_ole_bfloat'
-; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp ole <2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp ole <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ole <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2bf16 = fcmp ole <vscale x 2 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4bf16 = fcmp ole <vscale x 4 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8bf16 = fcmp ole <vscale x 8 x bfloat> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2bf16 = fcmp ole <2 x bfloat> undef, undef
- %v4bf16 = fcmp ole <4 x bfloat> undef, undef
- %v8bf16 = fcmp ole <8 x bfloat> undef, undef
+ %v2bf16 = fcmp ole <vscale x 2 x bfloat> undef, undef
+ %v4bf16 = fcmp ole <vscale x 4 x bfloat> undef, undef
+ %v8bf16 = fcmp ole <vscale x 8 x bfloat> undef, undef
ret void
}
define void @fcmp_one(i32 %arg) {
; CHECK-LABEL: 'fcmp_one'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp one <2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp one <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp one <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp one <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp one <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp one <2 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp one <4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp one <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp one <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp one <vscale x 2 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp one <vscale x 4 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp one <vscale x 8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp one <vscale x 2 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp one <vscale x 4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp one <vscale x 2 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp one <vscale x 4 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp one <vscale x 8 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp one <vscale x 16 x half> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2f32 = fcmp one <2 x float> undef, undef
- %v4f32 = fcmp one <4 x float> undef, undef
- %v8f32 = fcmp one <8 x float> undef, undef
- %v2f64 = fcmp one <2 x double> undef, undef
- %v4f64 = fcmp one <4 x double> undef, undef
- %v2f16 = fcmp one <2 x half> undef, undef
- %v4f16 = fcmp one <4 x half> undef, undef
- %v8f16 = fcmp one <8 x half> undef, undef
- %v16f16 = fcmp one <16 x half> undef, undef
+ %v2f32 = fcmp one <vscale x 2 x float> undef, undef
+ %v4f32 = fcmp one <vscale x 4 x float> undef, undef
+ %v8f32 = fcmp one <vscale x 8 x float> undef, undef
+ %v2f64 = fcmp one <vscale x 2 x double> undef, undef
+ %v4f64 = fcmp one <vscale x 4 x double> undef, undef
+ %v2f16 = fcmp one <vscale x 2 x half> undef, undef
+ %v4f16 = fcmp one <vscale x 4 x half> undef, undef
+ %v8f16 = fcmp one <vscale x 8 x half> undef, undef
+ %v16f16 = fcmp one <vscale x 16 x half> undef, undef
ret void
}
define void @fcmp_one_bfloat(i32 %arg) {
; CHECK-LABEL: 'fcmp_one_bfloat'
-; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp one <2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp one <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp one <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2bf16 = fcmp one <vscale x 2 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4bf16 = fcmp one <vscale x 4 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8bf16 = fcmp one <vscale x 8 x bfloat> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2bf16 = fcmp one <2 x bfloat> undef, undef
- %v4bf16 = fcmp one <4 x bfloat> undef, undef
- %v8bf16 = fcmp one <8 x bfloat> undef, undef
+ %v2bf16 = fcmp one <vscale x 2 x bfloat> undef, undef
+ %v4bf16 = fcmp one <vscale x 4 x bfloat> undef, undef
+ %v8bf16 = fcmp one <vscale x 8 x bfloat> undef, undef
ret void
}
define void @fcmp_ord(i32 %arg) {
; CHECK-LABEL: 'fcmp_ord'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ord <2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ord <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ord <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ord <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ord <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ord <2 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ord <4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ord <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ord <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ord <vscale x 2 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ord <vscale x 4 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ord <vscale x 8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ord <vscale x 2 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ord <vscale x 4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ord <vscale x 2 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ord <vscale x 4 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ord <vscale x 8 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ord <vscale x 16 x half> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2f32 = fcmp ord <2 x float> undef, undef
- %v4f32 = fcmp ord <4 x float> undef, undef
- %v8f32 = fcmp ord <8 x float> undef, undef
- %v2f64 = fcmp ord <2 x double> undef, undef
- %v4f64 = fcmp ord <4 x double> undef, undef
- %v2f16 = fcmp ord <2 x half> undef, undef
- %v4f16 = fcmp ord <4 x half> undef, undef
- %v8f16 = fcmp ord <8 x half> undef, undef
- %v16f16 = fcmp ord <16 x half> undef, undef
+ %v2f32 = fcmp ord <vscale x 2 x float> undef, undef
+ %v4f32 = fcmp ord <vscale x 4 x float> undef, undef
+ %v8f32 = fcmp ord <vscale x 8 x float> undef, undef
+ %v2f64 = fcmp ord <vscale x 2 x double> undef, undef
+ %v4f64 = fcmp ord <vscale x 4 x double> undef, undef
+ %v2f16 = fcmp ord <vscale x 2 x half> undef, undef
+ %v4f16 = fcmp ord <vscale x 4 x half> undef, undef
+ %v8f16 = fcmp ord <vscale x 8 x half> undef, undef
+ %v16f16 = fcmp ord <vscale x 16 x half> undef, undef
ret void
}
define void @fcmp_ord_bfloat(i32 %arg) {
; CHECK-LABEL: 'fcmp_ord_bfloat'
-; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp ord <2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp ord <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ord <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2bf16 = fcmp ord <vscale x 2 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4bf16 = fcmp ord <vscale x 4 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8bf16 = fcmp ord <vscale x 8 x bfloat> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2bf16 = fcmp ord <2 x bfloat> undef, undef
- %v4bf16 = fcmp ord <4 x bfloat> undef, undef
- %v8bf16 = fcmp ord <8 x bfloat> undef, undef
+ %v2bf16 = fcmp ord <vscale x 2 x bfloat> undef, undef
+ %v4bf16 = fcmp ord <vscale x 4 x bfloat> undef, undef
+ %v8bf16 = fcmp ord <vscale x 8 x bfloat> undef, undef
ret void
}
define void @fcmp_ueq(i32 %arg) {
; CHECK-LABEL: 'fcmp_ueq'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ueq <2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ueq <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ueq <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ueq <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ueq <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ueq <2 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ueq <4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ueq <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ueq <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ueq <vscale x 2 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ueq <vscale x 4 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ueq <vscale x 8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ueq <vscale x 2 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ueq <vscale x 4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ueq <vscale x 2 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ueq <vscale x 4 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ueq <vscale x 8 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ueq <vscale x 16 x half> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2f32 = fcmp ueq <2 x float> undef, undef
- %v4f32 = fcmp ueq <4 x float> undef, undef
- %v8f32 = fcmp ueq <8 x float> undef, undef
- %v2f64 = fcmp ueq <2 x double> undef, undef
- %v4f64 = fcmp ueq <4 x double> undef, undef
- %v2f16 = fcmp ueq <2 x half> undef, undef
- %v4f16 = fcmp ueq <4 x half> undef, undef
- %v8f16 = fcmp ueq <8 x half> undef, undef
- %v16f16 = fcmp ueq <16 x half> undef, undef
+ %v2f32 = fcmp ueq <vscale x 2 x float> undef, undef
+ %v4f32 = fcmp ueq <vscale x 4 x float> undef, undef
+ %v8f32 = fcmp ueq <vscale x 8 x float> undef, undef
+ %v2f64 = fcmp ueq <vscale x 2 x double> undef, undef
+ %v4f64 = fcmp ueq <vscale x 4 x double> undef, undef
+ %v2f16 = fcmp ueq <vscale x 2 x half> undef, undef
+ %v4f16 = fcmp ueq <vscale x 4 x half> undef, undef
+ %v8f16 = fcmp ueq <vscale x 8 x half> undef, undef
+ %v16f16 = fcmp ueq <vscale x 16 x half> undef, undef
ret void
}
define void @fcmp_ueq_bfloat(i32 %arg) {
; CHECK-LABEL: 'fcmp_ueq_bfloat'
-; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp ueq <2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp ueq <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ueq <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2bf16 = fcmp ueq <vscale x 2 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4bf16 = fcmp ueq <vscale x 4 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8bf16 = fcmp ueq <vscale x 8 x bfloat> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2bf16 = fcmp ueq <2 x bfloat> undef, undef
- %v4bf16 = fcmp ueq <4 x bfloat> undef, undef
- %v8bf16 = fcmp ueq <8 x bfloat> undef, undef
+ %v2bf16 = fcmp ueq <vscale x 2 x bfloat> undef, undef
+ %v4bf16 = fcmp ueq <vscale x 4 x bfloat> undef, undef
+ %v8bf16 = fcmp ueq <vscale x 8 x bfloat> undef, undef
ret void
}
define void @fcmp_ugt(i32 %arg) {
; CHECK-LABEL: 'fcmp_ugt'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ugt <2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ugt <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ugt <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ugt <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ugt <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ugt <2 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ugt <4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ugt <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ugt <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ugt <vscale x 2 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ugt <vscale x 4 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ugt <vscale x 8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ugt <vscale x 2 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ugt <vscale x 4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ugt <vscale x 2 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ugt <vscale x 4 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ugt <vscale x 8 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ugt <vscale x 16 x half> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2f32 = fcmp ugt <2 x float> undef, undef
- %v4f32 = fcmp ugt <4 x float> undef, undef
- %v8f32 = fcmp ugt <8 x float> undef, undef
- %v2f64 = fcmp ugt <2 x double> undef, undef
- %v4f64 = fcmp ugt <4 x double> undef, undef
- %v2f16 = fcmp ugt <2 x half> undef, undef
- %v4f16 = fcmp ugt <4 x half> undef, undef
- %v8f16 = fcmp ugt <8 x half> undef, undef
- %v16f16 = fcmp ugt <16 x half> undef, undef
+ %v2f32 = fcmp ugt <vscale x 2 x float> undef, undef
+ %v4f32 = fcmp ugt <vscale x 4 x float> undef, undef
+ %v8f32 = fcmp ugt <vscale x 8 x float> undef, undef
+ %v2f64 = fcmp ugt <vscale x 2 x double> undef, undef
+ %v4f64 = fcmp ugt <vscale x 4 x double> undef, undef
+ %v2f16 = fcmp ugt <vscale x 2 x half> undef, undef
+ %v4f16 = fcmp ugt <vscale x 4 x half> undef, undef
+ %v8f16 = fcmp ugt <vscale x 8 x half> undef, undef
+ %v16f16 = fcmp ugt <vscale x 16 x half> undef, undef
ret void
}
define void @fcmp_ugt_bfloat(i32 %arg) {
; CHECK-LABEL: 'fcmp_ugt_bfloat'
-; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp ugt <2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp ugt <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ugt <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2bf16 = fcmp ugt <vscale x 2 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4bf16 = fcmp ugt <vscale x 4 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8bf16 = fcmp ugt <vscale x 8 x bfloat> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2bf16 = fcmp ugt <2 x bfloat> undef, undef
- %v4bf16 = fcmp ugt <4 x bfloat> undef, undef
- %v8bf16 = fcmp ugt <8 x bfloat> undef, undef
+ %v2bf16 = fcmp ugt <vscale x 2 x bfloat> undef, undef
+ %v4bf16 = fcmp ugt <vscale x 4 x bfloat> undef, undef
+ %v8bf16 = fcmp ugt <vscale x 8 x bfloat> undef, undef
ret void
}
define void @fcmp_uge(i32 %arg) {
; CHECK-LABEL: 'fcmp_uge'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp uge <2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp uge <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp uge <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp uge <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp uge <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp uge <2 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp uge <4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp uge <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp uge <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp uge <vscale x 2 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp uge <vscale x 4 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp uge <vscale x 8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp uge <vscale x 2 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp uge <vscale x 4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp uge <vscale x 2 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp uge <vscale x 4 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp uge <vscale x 8 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp uge <vscale x 16 x half> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2f32 = fcmp uge <2 x float> undef, undef
- %v4f32 = fcmp uge <4 x float> undef, undef
- %v8f32 = fcmp uge <8 x float> undef, undef
- %v2f64 = fcmp uge <2 x double> undef, undef
- %v4f64 = fcmp uge <4 x double> undef, undef
- %v2f16 = fcmp uge <2 x half> undef, undef
- %v4f16 = fcmp uge <4 x half> undef, undef
- %v8f16 = fcmp uge <8 x half> undef, undef
- %v16f16 = fcmp uge <16 x half> undef, undef
+ %v2f32 = fcmp uge <vscale x 2 x float> undef, undef
+ %v4f32 = fcmp uge <vscale x 4 x float> undef, undef
+ %v8f32 = fcmp uge <vscale x 8 x float> undef, undef
+ %v2f64 = fcmp uge <vscale x 2 x double> undef, undef
+ %v4f64 = fcmp uge <vscale x 4 x double> undef, undef
+ %v2f16 = fcmp uge <vscale x 2 x half> undef, undef
+ %v4f16 = fcmp uge <vscale x 4 x half> undef, undef
+ %v8f16 = fcmp uge <vscale x 8 x half> undef, undef
+ %v16f16 = fcmp uge <vscale x 16 x half> undef, undef
ret void
}
define void @fcmp_uge_bfloat(i32 %arg) {
; CHECK-LABEL: 'fcmp_uge_bfloat'
-; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp uge <2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp uge <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp uge <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2bf16 = fcmp uge <vscale x 2 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4bf16 = fcmp uge <vscale x 4 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8bf16 = fcmp uge <vscale x 8 x bfloat> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2bf16 = fcmp uge <2 x bfloat> undef, undef
- %v4bf16 = fcmp uge <4 x bfloat> undef, undef
- %v8bf16 = fcmp uge <8 x bfloat> undef, undef
+ %v2bf16 = fcmp uge <vscale x 2 x bfloat> undef, undef
+ %v4bf16 = fcmp uge <vscale x 4 x bfloat> undef, undef
+ %v8bf16 = fcmp uge <vscale x 8 x bfloat> undef, undef
ret void
}
define void @fcmp_ult(i32 %arg) {
; CHECK-LABEL: 'fcmp_ult'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ult <2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ult <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ult <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ult <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ult <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ult <2 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ult <4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ult <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ult <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ult <vscale x 2 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ult <vscale x 4 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ult <vscale x 8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ult <vscale x 2 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ult <vscale x 4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ult <vscale x 2 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ult <vscale x 4 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ult <vscale x 8 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ult <vscale x 16 x half> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2f32 = fcmp ult <2 x float> undef, undef
- %v4f32 = fcmp ult <4 x float> undef, undef
- %v8f32 = fcmp ult <8 x float> undef, undef
- %v2f64 = fcmp ult <2 x double> undef, undef
- %v4f64 = fcmp ult <4 x double> undef, undef
- %v2f16 = fcmp ult <2 x half> undef, undef
- %v4f16 = fcmp ult <4 x half> undef, undef
- %v8f16 = fcmp ult <8 x half> undef, undef
- %v16f16 = fcmp ult <16 x half> undef, undef
+ %v2f32 = fcmp ult <vscale x 2 x float> undef, undef
+ %v4f32 = fcmp ult <vscale x 4 x float> undef, undef
+ %v8f32 = fcmp ult <vscale x 8 x float> undef, undef
+ %v2f64 = fcmp ult <vscale x 2 x double> undef, undef
+ %v4f64 = fcmp ult <vscale x 4 x double> undef, undef
+ %v2f16 = fcmp ult <vscale x 2 x half> undef, undef
+ %v4f16 = fcmp ult <vscale x 4 x half> undef, undef
+ %v8f16 = fcmp ult <vscale x 8 x half> undef, undef
+ %v16f16 = fcmp ult <vscale x 16 x half> undef, undef
ret void
}
define void @fcmp_ult_bfloat(i32 %arg) {
; CHECK-LABEL: 'fcmp_ult_bfloat'
-; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp ult <2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp ult <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ult <8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2bf16 = fcmp ult <vscale x 2 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4bf16 = fcmp ult <vscale x 4 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8bf16 = fcmp ult <vscale x 8 x bfloat> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2bf16 = fcmp ult <2 x bfloat> undef, undef
- %v4bf16 = fcmp ult <4 x bfloat> undef, undef
- %v8bf16 = fcmp ult <8 x bfloat> undef, undef
+ %v2bf16 = fcmp ult <vscale x 2 x bfloat> undef, undef
+ %v4bf16 = fcmp ult <vscale x 4 x bfloat> undef, undef
+ %v8bf16 = fcmp ult <vscale x 8 x bfloat> undef, undef
ret void
}
define void @fcmp_ule(i32 %arg) {
; CHECK-LABEL: 'fcmp_ule'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ule <2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ule <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ule <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ule <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ule <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ule <2 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ule <4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ule <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ule <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp ule <vscale x 2 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp ule <vscale x 4 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp ule <vscale x 8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp ule <vscale x 2 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp ule <vscale x 4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp ule <vscale x 2 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp ule <vscale x 4 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp ule <vscale x 8 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp ule <vscale x 16 x half> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2f32 = fcmp ule <2 x float> undef, undef
- %v4f32 = fcmp ule <4 x float> undef, undef
- %v8f32 = fcmp ule <8 x float> undef, undef
- %v2f64 = fcmp ule <2 x double> undef, undef
- %v4f64 = fcmp ule <4 x double> undef, undef
- %v2f16 = fcmp ule <2 x half> undef, undef
- %v4f16 = fcmp ule <4 x half> undef, undef
- %v8f16 = fcmp ule <8 x half> undef, undef
- %v16f16 = fcmp ule <16 x half> undef, undef
+ %v2f32 = fcmp ule <vscale x 2 x float> undef, undef
+ %v4f32 = fcmp ule <vscale x 4 x float> undef, undef
+ %v8f32 = fcmp ule <vscale x 8 x float> undef, undef
+ %v2f64 = fcmp ule <vscale x 2 x double> undef, undef
+ %v4f64 = fcmp ule <vscale x 4 x double> undef, undef
+ %v2f16 = fcmp ule <vscale x 2 x half> undef, undef
+ %v4f16 = fcmp ule <vscale x 4 x half> undef, undef
+ %v8f16 = fcmp ule <vscale x 8 x half> undef, undef
+ %v16f16 = fcmp ule <vscale x 16 x half> undef, undef
ret void
}
define void @fcmp_ule_bfloat(i32 %arg) {
; CHECK-LABEL: 'fcmp_ule_bfloat'
-; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp ule <2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp ule <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp ule <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp ule <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2bf16 = fcmp ule <vscale x 2 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4bf16 = fcmp ule <vscale x 4 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8bf16 = fcmp ule <vscale x 8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16bf16 = fcmp ule <vscale x 16 x bfloat> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2bf16 = fcmp ule <2 x bfloat> undef, undef
- %v4bf16 = fcmp ule <4 x bfloat> undef, undef
- %v8bf16 = fcmp ule <8 x bfloat> undef, undef
- %v16bf16 = fcmp ule <16 x bfloat> undef, undef
+ %v2bf16 = fcmp ule <vscale x 2 x bfloat> undef, undef
+ %v4bf16 = fcmp ule <vscale x 4 x bfloat> undef, undef
+ %v8bf16 = fcmp ule <vscale x 8 x bfloat> undef, undef
+ %v16bf16 = fcmp ule <vscale x 16 x bfloat> undef, undef
ret void
}
define void @fcmp_une(i32 %arg) {
; CHECK-LABEL: 'fcmp_une'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp une <2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp une <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp une <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp une <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp une <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp une <2 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp une <4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp une <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp une <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp une <vscale x 2 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp une <vscale x 4 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp une <vscale x 8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp une <vscale x 2 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp une <vscale x 4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp une <vscale x 2 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp une <vscale x 4 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp une <vscale x 8 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp une <vscale x 16 x half> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2f32 = fcmp une <2 x float> undef, undef
- %v4f32 = fcmp une <4 x float> undef, undef
- %v8f32 = fcmp une <8 x float> undef, undef
- %v2f64 = fcmp une <2 x double> undef, undef
- %v4f64 = fcmp une <4 x double> undef, undef
- %v2f16 = fcmp une <2 x half> undef, undef
- %v4f16 = fcmp une <4 x half> undef, undef
- %v8f16 = fcmp une <8 x half> undef, undef
- %v16f16 = fcmp une <16 x half> undef, undef
+ %v2f32 = fcmp une <vscale x 2 x float> undef, undef
+ %v4f32 = fcmp une <vscale x 4 x float> undef, undef
+ %v8f32 = fcmp une <vscale x 8 x float> undef, undef
+ %v2f64 = fcmp une <vscale x 2 x double> undef, undef
+ %v4f64 = fcmp une <vscale x 4 x double> undef, undef
+ %v2f16 = fcmp une <vscale x 2 x half> undef, undef
+ %v4f16 = fcmp une <vscale x 4 x half> undef, undef
+ %v8f16 = fcmp une <vscale x 8 x half> undef, undef
+ %v16f16 = fcmp une <vscale x 16 x half> undef, undef
ret void
}
define void @fcmp_une_bfloat(i32 %arg) {
; CHECK-LABEL: 'fcmp_une_bfloat'
-; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp une <2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp une <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp une <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp une <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2bf16 = fcmp une <vscale x 2 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4bf16 = fcmp une <vscale x 4 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8bf16 = fcmp une <vscale x 8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16bf16 = fcmp une <vscale x 16 x bfloat> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2bf16 = fcmp une <2 x bfloat> undef, undef
- %v4bf16 = fcmp une <4 x bfloat> undef, undef
- %v8bf16 = fcmp une <8 x bfloat> undef, undef
- %v16bf16 = fcmp une <16 x bfloat> undef, undef
+ %v2bf16 = fcmp une <vscale x 2 x bfloat> undef, undef
+ %v4bf16 = fcmp une <vscale x 4 x bfloat> undef, undef
+ %v8bf16 = fcmp une <vscale x 8 x bfloat> undef, undef
+ %v16bf16 = fcmp une <vscale x 16 x bfloat> undef, undef
ret void
}
define void @fcmp_uno(i32 %arg) {
; CHECK-LABEL: 'fcmp_uno'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp uno <2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp uno <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp uno <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp uno <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp uno <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp uno <2 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp uno <4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp uno <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp uno <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp uno <vscale x 2 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp uno <vscale x 4 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp uno <vscale x 8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp uno <vscale x 2 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp uno <vscale x 4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp uno <vscale x 2 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp uno <vscale x 4 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp uno <vscale x 8 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp uno <vscale x 16 x half> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2f32 = fcmp uno <2 x float> undef, undef
- %v4f32 = fcmp uno <4 x float> undef, undef
- %v8f32 = fcmp uno <8 x float> undef, undef
- %v2f64 = fcmp uno <2 x double> undef, undef
- %v4f64 = fcmp uno <4 x double> undef, undef
- %v2f16 = fcmp uno <2 x half> undef, undef
- %v4f16 = fcmp uno <4 x half> undef, undef
- %v8f16 = fcmp uno <8 x half> undef, undef
- %v16f16 = fcmp uno <16 x half> undef, undef
+ %v2f32 = fcmp uno <vscale x 2 x float> undef, undef
+ %v4f32 = fcmp uno <vscale x 4 x float> undef, undef
+ %v8f32 = fcmp uno <vscale x 8 x float> undef, undef
+ %v2f64 = fcmp uno <vscale x 2 x double> undef, undef
+ %v4f64 = fcmp uno <vscale x 4 x double> undef, undef
+ %v2f16 = fcmp uno <vscale x 2 x half> undef, undef
+ %v4f16 = fcmp uno <vscale x 4 x half> undef, undef
+ %v8f16 = fcmp uno <vscale x 8 x half> undef, undef
+ %v16f16 = fcmp uno <vscale x 16 x half> undef, undef
ret void
}
define void @fcmp_uno_bfloat(i32 %arg) {
; CHECK-LABEL: 'fcmp_uno_bfloat'
-; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp uno <2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp uno <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp uno <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp uno <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2bf16 = fcmp uno <vscale x 2 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4bf16 = fcmp uno <vscale x 4 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8bf16 = fcmp uno <vscale x 8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16bf16 = fcmp uno <vscale x 16 x bfloat> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2bf16 = fcmp uno <2 x bfloat> undef, undef
- %v4bf16 = fcmp uno <4 x bfloat> undef, undef
- %v8bf16 = fcmp uno <8 x bfloat> undef, undef
- %v16bf16 = fcmp uno <16 x bfloat> undef, undef
+ %v2bf16 = fcmp uno <vscale x 2 x bfloat> undef, undef
+ %v4bf16 = fcmp uno <vscale x 4 x bfloat> undef, undef
+ %v8bf16 = fcmp uno <vscale x 8 x bfloat> undef, undef
+ %v16bf16 = fcmp uno <vscale x 16 x bfloat> undef, undef
ret void
}
define void @fcmp_true(i32 %arg) {
; CHECK-LABEL: 'fcmp_true'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp true <2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp true <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp true <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp true <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp true <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp true <2 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp true <4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp true <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp true <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp true <vscale x 2 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp true <vscale x 4 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp true <vscale x 8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp true <vscale x 2 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp true <vscale x 4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp true <vscale x 2 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp true <vscale x 4 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp true <vscale x 8 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp true <vscale x 16 x half> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2f32 = fcmp true <2 x float> undef, undef
- %v4f32 = fcmp true <4 x float> undef, undef
- %v8f32 = fcmp true <8 x float> undef, undef
- %v2f64 = fcmp true <2 x double> undef, undef
- %v4f64 = fcmp true <4 x double> undef, undef
- %v2f16 = fcmp true <2 x half> undef, undef
- %v4f16 = fcmp true <4 x half> undef, undef
- %v8f16 = fcmp true <8 x half> undef, undef
- %v16f16 = fcmp true <16 x half> undef, undef
+ %v2f32 = fcmp true <vscale x 2 x float> undef, undef
+ %v4f32 = fcmp true <vscale x 4 x float> undef, undef
+ %v8f32 = fcmp true <vscale x 8 x float> undef, undef
+ %v2f64 = fcmp true <vscale x 2 x double> undef, undef
+ %v4f64 = fcmp true <vscale x 4 x double> undef, undef
+ %v2f16 = fcmp true <vscale x 2 x half> undef, undef
+ %v4f16 = fcmp true <vscale x 4 x half> undef, undef
+ %v8f16 = fcmp true <vscale x 8 x half> undef, undef
+ %v16f16 = fcmp true <vscale x 16 x half> undef, undef
ret void
}
define void @fcmp_true_bfloat(i32 %arg) {
; CHECK-LABEL: 'fcmp_true_bfloat'
-; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp true <2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp true <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp true <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp true <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2bf16 = fcmp true <vscale x 2 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4bf16 = fcmp true <vscale x 4 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8bf16 = fcmp true <vscale x 8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16bf16 = fcmp true <vscale x 16 x bfloat> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2bf16 = fcmp true <2 x bfloat> undef, undef
- %v4bf16 = fcmp true <4 x bfloat> undef, undef
- %v8bf16 = fcmp true <8 x bfloat> undef, undef
- %v16bf16 = fcmp true <16 x bfloat> undef, undef
+ %v2bf16 = fcmp true <vscale x 2 x bfloat> undef, undef
+ %v4bf16 = fcmp true <vscale x 4 x bfloat> undef, undef
+ %v8bf16 = fcmp true <vscale x 8 x bfloat> undef, undef
+ %v16bf16 = fcmp true <vscale x 16 x bfloat> undef, undef
ret void
}
define void @fcmp_false(i32 %arg) {
; CHECK-LABEL: 'fcmp_false'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp false <2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp false <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp false <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp false <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp false <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp false <2 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp false <4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp false <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp false <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f32 = fcmp false <vscale x 2 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f32 = fcmp false <vscale x 4 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8f32 = fcmp false <vscale x 8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f64 = fcmp false <vscale x 2 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4f64 = fcmp false <vscale x 4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2f16 = fcmp false <vscale x 2 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4f16 = fcmp false <vscale x 4 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8f16 = fcmp false <vscale x 8 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16f16 = fcmp false <vscale x 16 x half> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2f32 = fcmp false <2 x float> undef, undef
- %v4f32 = fcmp false <4 x float> undef, undef
- %v8f32 = fcmp false <8 x float> undef, undef
- %v2f64 = fcmp false <2 x double> undef, undef
- %v4f64 = fcmp false <4 x double> undef, undef
- %v2f16 = fcmp false <2 x half> undef, undef
- %v4f16 = fcmp false <4 x half> undef, undef
- %v8f16 = fcmp false <8 x half> undef, undef
- %v16f16 = fcmp false <16 x half> undef, undef
+ %v2f32 = fcmp false <vscale x 2 x float> undef, undef
+ %v4f32 = fcmp false <vscale x 4 x float> undef, undef
+ %v8f32 = fcmp false <vscale x 8 x float> undef, undef
+ %v2f64 = fcmp false <vscale x 2 x double> undef, undef
+ %v4f64 = fcmp false <vscale x 4 x double> undef, undef
+ %v2f16 = fcmp false <vscale x 2 x half> undef, undef
+ %v4f16 = fcmp false <vscale x 4 x half> undef, undef
+ %v8f16 = fcmp false <vscale x 8 x half> undef, undef
+ %v16f16 = fcmp false <vscale x 16 x half> undef, undef
ret void
}
define void @fcmp_false_bfloat(i32 %arg) {
; CHECK-LABEL: 'fcmp_false_bfloat'
-; CHECK-NEXT: Cost Model: Found costs of 3 for: %v2bf16 = fcmp false <2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %v4bf16 = fcmp false <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:5 SizeLat:5 for: %v8bf16 = fcmp false <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:7 SizeLat:7 for: %v16bf16 = fcmp false <16 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2bf16 = fcmp false <vscale x 2 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4bf16 = fcmp false <vscale x 4 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8bf16 = fcmp false <vscale x 8 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16bf16 = fcmp false <vscale x 16 x bfloat> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %v2bf16 = fcmp false <2 x bfloat> undef, undef
- %v4bf16 = fcmp false <4 x bfloat> undef, undef
- %v8bf16 = fcmp false <8 x bfloat> undef, undef
- %v16bf16 = fcmp false <16 x bfloat> undef, undef
+ %v2bf16 = fcmp false <vscale x 2 x bfloat> undef, undef
+ %v4bf16 = fcmp false <vscale x 4 x bfloat> undef, undef
+ %v8bf16 = fcmp false <vscale x 8 x bfloat> undef, undef
+ %v16bf16 = fcmp false <vscale x 16 x bfloat> undef, undef
ret void
}
diff --git a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
index 6c974af..673bf38 100644
--- a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
@@ -1,1579 +1,2413 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d < %s | FileCheck %s --check-prefixes=CHECK,NO-ZFHMIN
; Check that we don't crash querying costs when vectors are not enabled.
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64
define void @fadd() {
; CHECK-LABEL: 'fadd'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fadd bfloat undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fadd float undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fadd double undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1BF16 = fadd <1 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2BF16 = fadd <2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4BF16 = fadd <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8BF16 = fadd <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16BF16 = fadd <16 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1BF16 = fadd <vscale x 1 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2BF16 = fadd <vscale x 2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV4BF16 = fadd <vscale x 4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV8BF16 = fadd <vscale x 8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %NXV16BF16 = fadd <vscale x 16 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fadd <1 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fadd <2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fadd <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fadd <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fadd <16 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fadd <vscale x 1 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fadd <vscale x 2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fadd <vscale x 4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fadd <vscale x 8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fadd <vscale x 16 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fadd <1 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fadd <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fadd <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fadd <8 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fadd <vscale x 1 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64 = fadd <vscale x 2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fadd <vscale x 4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fadd <vscale x 8 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16_VP = call <1 x bfloat> @llvm.vp.fadd.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef, <1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F16_VP = call <2 x bfloat> @llvm.vp.fadd.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4F16_VP = call <4 x bfloat> @llvm.vp.fadd.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8F16_VP = call <8 x bfloat> @llvm.vp.fadd.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16F16_VP = call <16 x bfloat> @llvm.vp.fadd.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32_VP = call <1 x float> @llvm.vp.fadd.v1f32(<1 x float> undef, <1 x float> undef, <1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32_VP = call <2 x float> @llvm.vp.fadd.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32_VP = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32_VP = call <8 x float> @llvm.vp.fadd.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32_VP = call <16 x float> @llvm.vp.fadd.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64_VP = call <1 x double> @llvm.vp.fadd.v1f64(<1 x double> undef, <1 x double> undef, <1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64_VP = call <2 x double> @llvm.vp.fadd.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64_VP = call <4 x double> @llvm.vp.fadd.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64_VP = call <8 x double> @llvm.vp.fadd.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV1F16_VP = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV2F16_VP = call <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2bf16(<vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV4F16_VP = call <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4bf16(<vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV8F16_VP = call <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NXV16F16_VP = call <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16bf16(<vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32_VP = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32_VP = call <vscale x 2 x float> @llvm.vp.fadd.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32_VP = call <vscale x 4 x float> @llvm.vp.fadd.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32_VP = call <vscale x 8 x float> @llvm.vp.fadd.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32_VP = call <vscale x 16 x float> @llvm.vp.fadd.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64_VP = call <vscale x 1 x double> @llvm.vp.fadd.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64_VP = call <vscale x 2 x double> @llvm.vp.fadd.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64_VP = call <vscale x 4 x double> @llvm.vp.fadd.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64_VP = call <vscale x 8 x double> @llvm.vp.fadd.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fadd float poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fadd double poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fadd <1 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fadd <2 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fadd <4 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fadd <8 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fadd <16 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fadd <vscale x 1 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fadd <vscale x 2 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fadd <vscale x 4 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fadd <vscale x 8 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fadd <vscale x 16 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fadd <1 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fadd <2 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fadd <4 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fadd <8 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fadd <vscale x 1 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64 = fadd <vscale x 2 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fadd <vscale x 4 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fadd <vscale x 8 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32_VP = call <1 x float> @llvm.vp.fadd.v1f32(<1 x float> poison, <1 x float> poison, <1 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32_VP = call <2 x float> @llvm.vp.fadd.v2f32(<2 x float> poison, <2 x float> poison, <2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32_VP = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> poison, <4 x float> poison, <4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32_VP = call <8 x float> @llvm.vp.fadd.v8f32(<8 x float> poison, <8 x float> poison, <8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32_VP = call <16 x float> @llvm.vp.fadd.v16f32(<16 x float> poison, <16 x float> poison, <16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64_VP = call <1 x double> @llvm.vp.fadd.v1f64(<1 x double> poison, <1 x double> poison, <1 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64_VP = call <2 x double> @llvm.vp.fadd.v2f64(<2 x double> poison, <2 x double> poison, <2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64_VP = call <4 x double> @llvm.vp.fadd.v4f64(<4 x double> poison, <4 x double> poison, <4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64_VP = call <8 x double> @llvm.vp.fadd.v8f64(<8 x double> poison, <8 x double> poison, <8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32_VP = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> poison, <vscale x 1 x float> poison, <vscale x 1 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32_VP = call <vscale x 2 x float> @llvm.vp.fadd.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32_VP = call <vscale x 4 x float> @llvm.vp.fadd.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32_VP = call <vscale x 8 x float> @llvm.vp.fadd.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32_VP = call <vscale x 16 x float> @llvm.vp.fadd.nxv16f32(<vscale x 16 x float> poison, <vscale x 16 x float> poison, <vscale x 16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64_VP = call <vscale x 1 x double> @llvm.vp.fadd.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x double> poison, <vscale x 1 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64_VP = call <vscale x 2 x double> @llvm.vp.fadd.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64_VP = call <vscale x 4 x double> @llvm.vp.fadd.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64_VP = call <vscale x 8 x double> @llvm.vp.fadd.nxv8f64(<vscale x 8 x double> poison, <vscale x 8 x double> poison, <vscale x 8 x i1> poison, i32 poison)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %BF16 = fadd bfloat undef, undef
- %F32 = fadd float undef, undef
- %F64 = fadd double undef, undef
-
- %V1BF16 = fadd <1 x bfloat> undef, undef
- %V2BF16 = fadd <2 x bfloat> undef, undef
- %V4BF16 = fadd <4 x bfloat> undef, undef
- %V8BF16 = fadd <8 x bfloat> undef, undef
- %V16BF16 = fadd <16 x bfloat> undef, undef
-
- %NXV1BF16 = fadd <vscale x 1 x bfloat> undef, undef
- %NXV2BF16 = fadd <vscale x 2 x bfloat> undef, undef
- %NXV4BF16 = fadd <vscale x 4 x bfloat> undef, undef
- %NXV8BF16 = fadd <vscale x 8 x bfloat> undef, undef
- %NXV16BF16 = fadd <vscale x 16 x bfloat> undef, undef
-
- %V1F32 = fadd <1 x float> undef, undef
- %V2F32 = fadd <2 x float> undef, undef
- %V4F32 = fadd <4 x float> undef, undef
- %V8F32 = fadd <8 x float> undef, undef
- %V16F32 = fadd <16 x float> undef, undef
-
- %NXV1F32 = fadd <vscale x 1 x float> undef, undef
- %NXV2F32 = fadd <vscale x 2 x float> undef, undef
- %NXV4F32 = fadd <vscale x 4 x float> undef, undef
- %NXV8F32 = fadd <vscale x 8 x float> undef, undef
- %NXV16F32 = fadd <vscale x 16 x float> undef, undef
-
- %V1F64 = fadd <1 x double> undef, undef
- %V2F64 = fadd <2 x double> undef, undef
- %V4F64 = fadd <4 x double> undef, undef
- %V8F64 = fadd <8 x double> undef, undef
-
- %NXV1F64 = fadd <vscale x 1 x double> undef, undef
- %NXV2F64 = fadd <vscale x 2 x double> undef, undef
- %NXV4F64 = fadd <vscale x 4 x double> undef, undef
- %NXV8F64 = fadd <vscale x 8 x double> undef, undef
-
- %V1F16_VP = call <1 x bfloat> @llvm.vp.fadd.v1f16(<1 x bfloat> undef, <1 x bfloat> undef, <1 x i1> undef, i32 undef)
- %V2F16_VP = call <2 x bfloat> @llvm.vp.fadd.v2f16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x i1> undef, i32 undef)
- %V4F16_VP = call <4 x bfloat> @llvm.vp.fadd.v4f16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x i1> undef, i32 undef)
- %V8F16_VP = call <8 x bfloat> @llvm.vp.fadd.v8f16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x i1> undef, i32 undef)
- %V16F16_VP = call <16 x bfloat> @llvm.vp.fadd.v16f16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x i1> undef, i32 undef)
-
- %V1F32_VP = call <1 x float> @llvm.vp.fadd.v1f32(<1 x float> undef, <1 x float> undef, <1 x i1> undef, i32 undef)
- %V2F32_VP = call <2 x float> @llvm.vp.fadd.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
- %V4F32_VP = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
- %V8F32_VP = call <8 x float> @llvm.vp.fadd.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
- %V16F32_VP = call <16 x float> @llvm.vp.fadd.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
-
- %V1F64_VP = call <1 x double> @llvm.vp.fadd.v1f64(<1 x double> undef, <1 x double> undef, <1 x i1> undef, i32 undef)
- %V2F64_VP = call <2 x double> @llvm.vp.fadd.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
- %V4F64_VP = call <4 x double> @llvm.vp.fadd.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
- %V8F64_VP = call <8 x double> @llvm.vp.fadd.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
-
- %NXV1F16_VP = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1f16(<vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef, <vscale x 1 x i1> undef, i32 undef)
- %NXV2F16_VP = call <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2f16(<vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef, <vscale x 2 x i1> undef, i32 undef)
- %NXV4F16_VP = call <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4f16(<vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef, <vscale x 4 x i1> undef, i32 undef)
- %NXV8F16_VP = call <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8f16(<vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef, <vscale x 8 x i1> undef, i32 undef)
- %NXV16F16_VP = call <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16f16(<vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef, <vscale x 16 x i1> undef, i32 undef)
-
- %NXV1F32_VP = call <vscale x 1 x float> @llvm.vp.fadd.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
- %NXV2F32_VP = call <vscale x 2 x float> @llvm.vp.fadd.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
- %NXV4F32_VP = call <vscale x 4 x float> @llvm.vp.fadd.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
- %NXV8F32_VP = call <vscale x 8 x float> @llvm.vp.fadd.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
- %NXV16F32_VP = call <vscale x 16 x float> @llvm.vp.fadd.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-
- %NXV1F64_VP = call <vscale x 1 x double> @llvm.vp.fadd.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
- %NXV2F64_VP = call <vscale x 2 x double> @llvm.vp.fadd.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
- %NXV4F64_VP = call <vscale x 4 x double> @llvm.vp.fadd.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
- %NXV8F64_VP = call <vscale x 8 x double> @llvm.vp.fadd.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+ %F32 = fadd float poison, poison
+ %F64 = fadd double poison, poison
+
+ %V1F32 = fadd <1 x float> poison, poison
+ %V2F32 = fadd <2 x float> poison, poison
+ %V4F32 = fadd <4 x float> poison, poison
+ %V8F32 = fadd <8 x float> poison, poison
+ %V16F32 = fadd <16 x float> poison, poison
+
+ %NXV1F32 = fadd <vscale x 1 x float> poison, poison
+ %NXV2F32 = fadd <vscale x 2 x float> poison, poison
+ %NXV4F32 = fadd <vscale x 4 x float> poison, poison
+ %NXV8F32 = fadd <vscale x 8 x float> poison, poison
+ %NXV16F32 = fadd <vscale x 16 x float> poison, poison
+
+ %V1F64 = fadd <1 x double> poison, poison
+ %V2F64 = fadd <2 x double> poison, poison
+ %V4F64 = fadd <4 x double> poison, poison
+ %V8F64 = fadd <8 x double> poison, poison
+
+ %NXV1F64 = fadd <vscale x 1 x double> poison, poison
+ %NXV2F64 = fadd <vscale x 2 x double> poison, poison
+ %NXV4F64 = fadd <vscale x 4 x double> poison, poison
+ %NXV8F64 = fadd <vscale x 8 x double> poison, poison
+
+ %V1F32_VP = call <1 x float> @llvm.vp.fadd(<1 x float> poison, <1 x float> poison, <1 x i1> poison, i32 poison)
+ %V2F32_VP = call <2 x float> @llvm.vp.fadd(<2 x float> poison, <2 x float> poison, <2 x i1> poison, i32 poison)
+ %V4F32_VP = call <4 x float> @llvm.vp.fadd(<4 x float> poison, <4 x float> poison, <4 x i1> poison, i32 poison)
+ %V8F32_VP = call <8 x float> @llvm.vp.fadd(<8 x float> poison, <8 x float> poison, <8 x i1> poison, i32 poison)
+ %V16F32_VP = call <16 x float> @llvm.vp.fadd(<16 x float> poison, <16 x float> poison, <16 x i1> poison, i32 poison)
+
+ %V1F64_VP = call <1 x double> @llvm.vp.fadd(<1 x double> poison, <1 x double> poison, <1 x i1> poison, i32 poison)
+ %V2F64_VP = call <2 x double> @llvm.vp.fadd(<2 x double> poison, <2 x double> poison, <2 x i1> poison, i32 poison)
+ %V4F64_VP = call <4 x double> @llvm.vp.fadd(<4 x double> poison, <4 x double> poison, <4 x i1> poison, i32 poison)
+ %V8F64_VP = call <8 x double> @llvm.vp.fadd(<8 x double> poison, <8 x double> poison, <8 x i1> poison, i32 poison)
+
+ %NXV1F32_VP = call <vscale x 1 x float> @llvm.vp.fadd(<vscale x 1 x float> poison, <vscale x 1 x float> poison, <vscale x 1 x i1> poison, i32 poison)
+ %NXV2F32_VP = call <vscale x 2 x float> @llvm.vp.fadd(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x i1> poison, i32 poison)
+ %NXV4F32_VP = call <vscale x 4 x float> @llvm.vp.fadd(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x i1> poison, i32 poison)
+ %NXV8F32_VP = call <vscale x 8 x float> @llvm.vp.fadd(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x i1> poison, i32 poison)
+ %NXV16F32_VP = call <vscale x 16 x float> @llvm.vp.fadd(<vscale x 16 x float> poison, <vscale x 16 x float> poison, <vscale x 16 x i1> poison, i32 poison)
+
+ %NXV1F64_VP = call <vscale x 1 x double> @llvm.vp.fadd(<vscale x 1 x double> poison, <vscale x 1 x double> poison, <vscale x 1 x i1> poison, i32 poison)
+ %NXV2F64_VP = call <vscale x 2 x double> @llvm.vp.fadd(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x i1> poison, i32 poison)
+ %NXV4F64_VP = call <vscale x 4 x double> @llvm.vp.fadd(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x i1> poison, i32 poison)
+ %NXV8F64_VP = call <vscale x 8 x double> @llvm.vp.fadd(<vscale x 8 x double> poison, <vscale x 8 x double> poison, <vscale x 8 x i1> poison, i32 poison)
+
+ ret void
+}
+
+define void @fadd_bf16() {
+; ZVFH-LABEL: 'fadd_bf16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fadd bfloat poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1BF16 = fadd <1 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2BF16 = fadd <2 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4BF16 = fadd <4 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8BF16 = fadd <8 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16BF16 = fadd <16 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1BF16 = fadd <vscale x 1 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2BF16 = fadd <vscale x 2 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV4BF16 = fadd <vscale x 4 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV8BF16 = fadd <vscale x 8 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %NXV16BF16 = fadd <vscale x 16 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1BF16_VP = call <1 x bfloat> @llvm.vp.fadd.v1bf16(<1 x bfloat> poison, <1 x bfloat> poison, <1 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2BF16_VP = call <2 x bfloat> @llvm.vp.fadd.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4BF16_VP = call <4 x bfloat> @llvm.vp.fadd.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8BF16_VP = call <8 x bfloat> @llvm.vp.fadd.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison, <8 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16BF16_VP = call <16 x bfloat> @llvm.vp.fadd.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'fadd_bf16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fadd bfloat poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1BF16 = fadd <1 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2BF16 = fadd <2 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4BF16 = fadd <4 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8BF16 = fadd <8 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16BF16 = fadd <16 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1BF16 = fadd <vscale x 1 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2BF16 = fadd <vscale x 2 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV4BF16 = fadd <vscale x 4 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV8BF16 = fadd <vscale x 8 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %NXV16BF16 = fadd <vscale x 16 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1BF16_VP = call <1 x bfloat> @llvm.vp.fadd.v1bf16(<1 x bfloat> poison, <1 x bfloat> poison, <1 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2BF16_VP = call <2 x bfloat> @llvm.vp.fadd.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4BF16_VP = call <4 x bfloat> @llvm.vp.fadd.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8BF16_VP = call <8 x bfloat> @llvm.vp.fadd.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison, <8 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16BF16_VP = call <16 x bfloat> @llvm.vp.fadd.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; NO-ZFHMIN-LABEL: 'fadd_bf16'
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fadd bfloat poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16 = fadd <1 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16 = fadd <2 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16 = fadd <4 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16 = fadd <8 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16BF16 = fadd <16 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16 = fadd <vscale x 1 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16 = fadd <vscale x 2 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16 = fadd <vscale x 4 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16 = fadd <vscale x 8 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16 = fadd <vscale x 16 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16_VP = call <1 x bfloat> @llvm.vp.fadd.v1bf16(<1 x bfloat> poison, <1 x bfloat> poison, <1 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16_VP = call <2 x bfloat> @llvm.vp.fadd.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16_VP = call <4 x bfloat> @llvm.vp.fadd.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16_VP = call <8 x bfloat> @llvm.vp.fadd.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison, <8 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16BF16_VP = call <16 x bfloat> @llvm.vp.fadd.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.fadd.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.fadd.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.fadd.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.fadd.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.fadd.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %BF16 = fadd bfloat poison, poison
+
+ %V1BF16 = fadd <1 x bfloat> poison, poison
+ %V2BF16 = fadd <2 x bfloat> poison, poison
+ %V4BF16 = fadd <4 x bfloat> poison, poison
+ %V8BF16 = fadd <8 x bfloat> poison, poison
+ %V16BF16 = fadd <16 x bfloat> poison, poison
+
+ %NXV1BF16 = fadd <vscale x 1 x bfloat> poison, poison
+ %NXV2BF16 = fadd <vscale x 2 x bfloat> poison, poison
+ %NXV4BF16 = fadd <vscale x 4 x bfloat> poison, poison
+ %NXV8BF16 = fadd <vscale x 8 x bfloat> poison, poison
+ %NXV16BF16 = fadd <vscale x 16 x bfloat> poison, poison
+
+ %V1BF16_VP = call <1 x bfloat> @llvm.vp.fadd(<1 x bfloat> poison, <1 x bfloat> poison, <1 x i1> poison, i32 poison)
+ %V2BF16_VP = call <2 x bfloat> @llvm.vp.fadd(<2 x bfloat> poison, <2 x bfloat> poison, <2 x i1> poison, i32 poison)
+ %V4BF16_VP = call <4 x bfloat> @llvm.vp.fadd(<4 x bfloat> poison, <4 x bfloat> poison, <4 x i1> poison, i32 poison)
+ %V8BF16_VP = call <8 x bfloat> @llvm.vp.fadd(<8 x bfloat> poison, <8 x bfloat> poison, <8 x i1> poison, i32 poison)
+ %V16BF16_VP = call <16 x bfloat> @llvm.vp.fadd(<16 x bfloat> poison, <16 x bfloat> poison, <16 x i1> poison, i32 poison)
+
+ %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.fadd(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x i1> poison, i32 poison)
+ %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.fadd(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+ %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.fadd(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+ %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.fadd(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+ %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.fadd(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
ret void
}
define void @fadd_f16() {
; ZVFH-LABEL: 'fadd_f16'
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fadd half undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fadd <1 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fadd <2 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fadd <4 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fadd <8 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fadd <16 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fadd <32 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fadd <vscale x 1 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fadd <vscale x 2 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fadd <vscale x 4 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fadd <vscale x 8 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fadd <vscale x 16 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fadd <vscale x 32 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.fadd.v1f16(<1 x half> undef, <1 x half> undef, <1 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.fadd.v2f16(<2 x half> undef, <2 x half> undef, <2 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.fadd.v4f16(<4 x half> undef, <4 x half> undef, <4 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.fadd.v8f16(<8 x half> undef, <8 x half> undef, <8 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.fadd.v16f16(<16 x half> undef, <16 x half> undef, <16 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef, <vscale x 1 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fadd.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef, <vscale x 2 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fadd.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fadd.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fadd half poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fadd <1 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fadd <2 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fadd <4 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fadd <8 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fadd <16 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fadd <32 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fadd <vscale x 1 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fadd <vscale x 2 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fadd <vscale x 4 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fadd <vscale x 8 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fadd <vscale x 16 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fadd <vscale x 32 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.fadd.v1f16(<1 x half> poison, <1 x half> poison, <1 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.fadd.v2f16(<2 x half> poison, <2 x half> poison, <2 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.fadd.v4f16(<4 x half> poison, <4 x half> poison, <4 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.fadd.v8f16(<8 x half> poison, <8 x half> poison, <8 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.fadd.v16f16(<16 x half> poison, <16 x half> poison, <16 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fadd.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fadd.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fadd.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; ZVFHMIN-LABEL: 'fadd_f16'
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fadd half undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1F16 = fadd <1 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F16 = fadd <2 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4F16 = fadd <4 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8F16 = fadd <8 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16F16 = fadd <16 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32F16 = fadd <32 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1F16 = fadd <vscale x 1 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2F16 = fadd <vscale x 2 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV4F16 = fadd <vscale x 4 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV8F16 = fadd <vscale x 8 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %NXV16F16 = fadd <vscale x 16 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fadd <vscale x 32 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.fadd.v1f16(<1 x half> undef, <1 x half> undef, <1 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.fadd.v2f16(<2 x half> undef, <2 x half> undef, <2 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.fadd.v4f16(<4 x half> undef, <4 x half> undef, <4 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.fadd.v8f16(<8 x half> undef, <8 x half> undef, <8 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.fadd.v16f16(<16 x half> undef, <16 x half> undef, <16 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef, <vscale x 1 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fadd.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef, <vscale x 2 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fadd.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fadd.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fadd half poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1F16 = fadd <1 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F16 = fadd <2 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4F16 = fadd <4 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8F16 = fadd <8 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16F16 = fadd <16 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32F16 = fadd <32 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1F16 = fadd <vscale x 1 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2F16 = fadd <vscale x 2 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV4F16 = fadd <vscale x 4 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV8F16 = fadd <vscale x 8 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %NXV16F16 = fadd <vscale x 16 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fadd <vscale x 32 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.fadd.v1f16(<1 x half> poison, <1 x half> poison, <1 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.fadd.v2f16(<2 x half> poison, <2 x half> poison, <2 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.fadd.v4f16(<4 x half> poison, <4 x half> poison, <4 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.fadd.v8f16(<8 x half> poison, <8 x half> poison, <8 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.fadd.v16f16(<16 x half> poison, <16 x half> poison, <16 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fadd.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fadd.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fadd.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %F16 = fadd half undef, undef
-
- %V1F16 = fadd <1 x half> undef, undef
- %V2F16 = fadd <2 x half> undef, undef
- %V4F16 = fadd <4 x half> undef, undef
- %V8F16 = fadd <8 x half> undef, undef
- %V16F16 = fadd <16 x half> undef, undef
- %V32F16 = fadd <32 x half> undef, undef
-
- %NXV1F16 = fadd <vscale x 1 x half> undef, undef
- %NXV2F16 = fadd <vscale x 2 x half> undef, undef
- %NXV4F16 = fadd <vscale x 4 x half> undef, undef
- %NXV8F16 = fadd <vscale x 8 x half> undef, undef
- %NXV16F16 = fadd <vscale x 16 x half> undef, undef
- %NXV32F16 = fadd <vscale x 32 x half> undef, undef
-
- %V1F16_VP = call <1 x half> @llvm.vp.fadd.v1f16(<1 x half> undef, <1 x half> undef, <1 x i1> undef, i32 undef)
- %V2F16_VP = call <2 x half> @llvm.vp.fadd.v2f16(<2 x half> undef, <2 x half> undef, <2 x i1> undef, i32 undef)
- %V4F16_VP = call <4 x half> @llvm.vp.fadd.v4f16(<4 x half> undef, <4 x half> undef, <4 x i1> undef, i32 undef)
- %V8F16_VP = call <8 x half> @llvm.vp.fadd.v8f16(<8 x half> undef, <8 x half> undef, <8 x i1> undef, i32 undef)
- %V16F16_VP = call <16 x half> @llvm.vp.fadd.v16f16(<16 x half> undef, <16 x half> undef, <16 x i1> undef, i32 undef)
-
- %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef, <vscale x 1 x i1> undef, i32 undef)
- %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fadd.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef, <vscale x 2 x i1> undef, i32 undef)
- %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fadd.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x i1> undef, i32 undef)
- %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x i1> undef, i32 undef)
- %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fadd.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef)
+; NO-ZFHMIN-LABEL: 'fadd_f16'
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fadd half poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fadd <1 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F16 = fadd <2 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F16 = fadd <4 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F16 = fadd <8 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F16 = fadd <16 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32F16 = fadd <32 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16 = fadd <vscale x 1 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16 = fadd <vscale x 2 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16 = fadd <vscale x 4 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16 = fadd <vscale x 8 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16 = fadd <vscale x 16 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV32F16 = fadd <vscale x 32 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.fadd.v1f16(<1 x half> poison, <1 x half> poison, <1 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.fadd.v2f16(<2 x half> poison, <2 x half> poison, <2 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.fadd.v4f16(<4 x half> poison, <4 x half> poison, <4 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.fadd.v8f16(<8 x half> poison, <8 x half> poison, <8 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.fadd.v16f16(<16 x half> poison, <16 x half> poison, <16 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fadd.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fadd.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fadd.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fadd.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fadd.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %F16 = fadd half poison, poison
+
+ %V1F16 = fadd <1 x half> poison, poison
+ %V2F16 = fadd <2 x half> poison, poison
+ %V4F16 = fadd <4 x half> poison, poison
+ %V8F16 = fadd <8 x half> poison, poison
+ %V16F16 = fadd <16 x half> poison, poison
+ %V32F16 = fadd <32 x half> poison, poison
+
+ %NXV1F16 = fadd <vscale x 1 x half> poison, poison
+ %NXV2F16 = fadd <vscale x 2 x half> poison, poison
+ %NXV4F16 = fadd <vscale x 4 x half> poison, poison
+ %NXV8F16 = fadd <vscale x 8 x half> poison, poison
+ %NXV16F16 = fadd <vscale x 16 x half> poison, poison
+ %NXV32F16 = fadd <vscale x 32 x half> poison, poison
+
+ %V1F16_VP = call <1 x half> @llvm.vp.fadd(<1 x half> poison, <1 x half> poison, <1 x i1> poison, i32 poison)
+ %V2F16_VP = call <2 x half> @llvm.vp.fadd(<2 x half> poison, <2 x half> poison, <2 x i1> poison, i32 poison)
+ %V4F16_VP = call <4 x half> @llvm.vp.fadd(<4 x half> poison, <4 x half> poison, <4 x i1> poison, i32 poison)
+ %V8F16_VP = call <8 x half> @llvm.vp.fadd(<8 x half> poison, <8 x half> poison, <8 x i1> poison, i32 poison)
+ %V16F16_VP = call <16 x half> @llvm.vp.fadd(<16 x half> poison, <16 x half> poison, <16 x i1> poison, i32 poison)
+
+ %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fadd(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x i1> poison, i32 poison)
+ %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fadd(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+ %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fadd(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+ %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fadd(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+ %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fadd(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
ret void
}
define void @fsub() {
; CHECK-LABEL: 'fsub'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fsub half undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fsub float undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fsub double undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1BF16 = fsub <1 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2BF16 = fsub <2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4BF16 = fsub <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8BF16 = fsub <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16BF16 = fsub <16 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1BF16 = fsub <vscale x 1 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2BF16 = fsub <vscale x 2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV4BF16 = fsub <vscale x 4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV8BF16 = fsub <vscale x 8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %NXV16BF16 = fsub <vscale x 16 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fsub <1 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fsub <2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fsub <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fsub <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fsub <16 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fsub <vscale x 1 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fsub <vscale x 2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fsub <vscale x 4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fsub <vscale x 8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fsub <vscale x 16 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fsub <1 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fsub <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fsub <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fsub <8 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fsub <vscale x 1 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64 = fsub <vscale x 2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fsub <vscale x 4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fsub <vscale x 8 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16_VP = call <1 x bfloat> @llvm.vp.fsub.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef, <1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F16_VP = call <2 x bfloat> @llvm.vp.fsub.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4F16_VP = call <4 x bfloat> @llvm.vp.fsub.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8F16_VP = call <8 x bfloat> @llvm.vp.fsub.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16F16_VP = call <16 x bfloat> @llvm.vp.fsub.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32_VP = call <1 x float> @llvm.vp.fsub.v1f32(<1 x float> undef, <1 x float> undef, <1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32_VP = call <2 x float> @llvm.vp.fsub.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32_VP = call <4 x float> @llvm.vp.fsub.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32_VP = call <8 x float> @llvm.vp.fsub.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32_VP = call <16 x float> @llvm.vp.fsub.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64_VP = call <1 x double> @llvm.vp.fsub.v1f64(<1 x double> undef, <1 x double> undef, <1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64_VP = call <2 x double> @llvm.vp.fsub.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64_VP = call <4 x double> @llvm.vp.fsub.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64_VP = call <8 x double> @llvm.vp.fsub.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV1F16_VP = call <vscale x 1 x bfloat> @llvm.vp.fsub.nxv1bf16(<vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV2F16_VP = call <vscale x 2 x bfloat> @llvm.vp.fsub.nxv2bf16(<vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV4F16_VP = call <vscale x 4 x bfloat> @llvm.vp.fsub.nxv4bf16(<vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV8F16_VP = call <vscale x 8 x bfloat> @llvm.vp.fsub.nxv8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NXV16F16_VP = call <vscale x 16 x bfloat> @llvm.vp.fsub.nxv16bf16(<vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32_VP = call <vscale x 1 x float> @llvm.vp.fsub.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32_VP = call <vscale x 2 x float> @llvm.vp.fsub.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32_VP = call <vscale x 4 x float> @llvm.vp.fsub.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32_VP = call <vscale x 8 x float> @llvm.vp.fsub.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32_VP = call <vscale x 16 x float> @llvm.vp.fsub.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64_VP = call <vscale x 1 x double> @llvm.vp.fsub.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64_VP = call <vscale x 2 x double> @llvm.vp.fsub.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64_VP = call <vscale x 4 x double> @llvm.vp.fsub.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64_VP = call <vscale x 8 x double> @llvm.vp.fsub.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fsub float poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fsub double poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fsub <1 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fsub <2 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fsub <4 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fsub <8 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fsub <16 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fsub <vscale x 1 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fsub <vscale x 2 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fsub <vscale x 4 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fsub <vscale x 8 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fsub <vscale x 16 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fsub <1 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fsub <2 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fsub <4 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fsub <8 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fsub <vscale x 1 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64 = fsub <vscale x 2 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fsub <vscale x 4 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fsub <vscale x 8 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32_VP = call <1 x float> @llvm.vp.fsub.v1f32(<1 x float> poison, <1 x float> poison, <1 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32_VP = call <2 x float> @llvm.vp.fsub.v2f32(<2 x float> poison, <2 x float> poison, <2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32_VP = call <4 x float> @llvm.vp.fsub.v4f32(<4 x float> poison, <4 x float> poison, <4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32_VP = call <8 x float> @llvm.vp.fsub.v8f32(<8 x float> poison, <8 x float> poison, <8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32_VP = call <16 x float> @llvm.vp.fsub.v16f32(<16 x float> poison, <16 x float> poison, <16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64_VP = call <1 x double> @llvm.vp.fsub.v1f64(<1 x double> poison, <1 x double> poison, <1 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64_VP = call <2 x double> @llvm.vp.fsub.v2f64(<2 x double> poison, <2 x double> poison, <2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64_VP = call <4 x double> @llvm.vp.fsub.v4f64(<4 x double> poison, <4 x double> poison, <4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64_VP = call <8 x double> @llvm.vp.fsub.v8f64(<8 x double> poison, <8 x double> poison, <8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32_VP = call <vscale x 1 x float> @llvm.vp.fsub.nxv1f32(<vscale x 1 x float> poison, <vscale x 1 x float> poison, <vscale x 1 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32_VP = call <vscale x 2 x float> @llvm.vp.fsub.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32_VP = call <vscale x 4 x float> @llvm.vp.fsub.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32_VP = call <vscale x 8 x float> @llvm.vp.fsub.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32_VP = call <vscale x 16 x float> @llvm.vp.fsub.nxv16f32(<vscale x 16 x float> poison, <vscale x 16 x float> poison, <vscale x 16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64_VP = call <vscale x 1 x double> @llvm.vp.fsub.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x double> poison, <vscale x 1 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64_VP = call <vscale x 2 x double> @llvm.vp.fsub.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64_VP = call <vscale x 4 x double> @llvm.vp.fsub.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64_VP = call <vscale x 8 x double> @llvm.vp.fsub.nxv8f64(<vscale x 8 x double> poison, <vscale x 8 x double> poison, <vscale x 8 x i1> poison, i32 poison)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %F16 = fsub half undef, undef
- %F32 = fsub float undef, undef
- %F64 = fsub double undef, undef
-
- %V1BF16 = fsub <1 x bfloat> undef, undef
- %V2BF16 = fsub <2 x bfloat> undef, undef
- %V4BF16 = fsub <4 x bfloat> undef, undef
- %V8BF16 = fsub <8 x bfloat> undef, undef
- %V16BF16 = fsub <16 x bfloat> undef, undef
-
- %NXV1BF16 = fsub <vscale x 1 x bfloat> undef, undef
- %NXV2BF16 = fsub <vscale x 2 x bfloat> undef, undef
- %NXV4BF16 = fsub <vscale x 4 x bfloat> undef, undef
- %NXV8BF16 = fsub <vscale x 8 x bfloat> undef, undef
- %NXV16BF16 = fsub <vscale x 16 x bfloat> undef, undef
-
- %V1F32 = fsub <1 x float> undef, undef
- %V2F32 = fsub <2 x float> undef, undef
- %V4F32 = fsub <4 x float> undef, undef
- %V8F32 = fsub <8 x float> undef, undef
- %V16F32 = fsub <16 x float> undef, undef
-
- %NXV1F32 = fsub <vscale x 1 x float> undef, undef
- %NXV2F32 = fsub <vscale x 2 x float> undef, undef
- %NXV4F32 = fsub <vscale x 4 x float> undef, undef
- %NXV8F32 = fsub <vscale x 8 x float> undef, undef
- %NXV16F32 = fsub <vscale x 16 x float> undef, undef
-
- %V1F64 = fsub <1 x double> undef, undef
- %V2F64 = fsub <2 x double> undef, undef
- %V4F64 = fsub <4 x double> undef, undef
- %V8F64 = fsub <8 x double> undef, undef
-
- %NXV1F64 = fsub <vscale x 1 x double> undef, undef
- %NXV2F64 = fsub <vscale x 2 x double> undef, undef
- %NXV4F64 = fsub <vscale x 4 x double> undef, undef
- %NXV8F64 = fsub <vscale x 8 x double> undef, undef
-
- %V1F16_VP = call <1 x bfloat> @llvm.vp.fsub.v1f16(<1 x bfloat> undef, <1 x bfloat> undef, <1 x i1> undef, i32 undef)
- %V2F16_VP = call <2 x bfloat> @llvm.vp.fsub.v2f16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x i1> undef, i32 undef)
- %V4F16_VP = call <4 x bfloat> @llvm.vp.fsub.v4f16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x i1> undef, i32 undef)
- %V8F16_VP = call <8 x bfloat> @llvm.vp.fsub.v8f16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x i1> undef, i32 undef)
- %V16F16_VP = call <16 x bfloat> @llvm.vp.fsub.v16f16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x i1> undef, i32 undef)
-
- %V1F32_VP = call <1 x float> @llvm.vp.fsub.v1f32(<1 x float> undef, <1 x float> undef, <1 x i1> undef, i32 undef)
- %V2F32_VP = call <2 x float> @llvm.vp.fsub.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
- %V4F32_VP = call <4 x float> @llvm.vp.fsub.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
- %V8F32_VP = call <8 x float> @llvm.vp.fsub.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
- %V16F32_VP = call <16 x float> @llvm.vp.fsub.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
-
- %V1F64_VP = call <1 x double> @llvm.vp.fsub.v1f64(<1 x double> undef, <1 x double> undef, <1 x i1> undef, i32 undef)
- %V2F64_VP = call <2 x double> @llvm.vp.fsub.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
- %V4F64_VP = call <4 x double> @llvm.vp.fsub.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
- %V8F64_VP = call <8 x double> @llvm.vp.fsub.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
-
- %NXV1F16_VP = call <vscale x 1 x bfloat> @llvm.vp.fsub.nxv1f16(<vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef, <vscale x 1 x i1> undef, i32 undef)
- %NXV2F16_VP = call <vscale x 2 x bfloat> @llvm.vp.fsub.nxv2f16(<vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef, <vscale x 2 x i1> undef, i32 undef)
- %NXV4F16_VP = call <vscale x 4 x bfloat> @llvm.vp.fsub.nxv4f16(<vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef, <vscale x 4 x i1> undef, i32 undef)
- %NXV8F16_VP = call <vscale x 8 x bfloat> @llvm.vp.fsub.nxv8f16(<vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef, <vscale x 8 x i1> undef, i32 undef)
- %NXV16F16_VP = call <vscale x 16 x bfloat> @llvm.vp.fsub.nxv16f16(<vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef, <vscale x 16 x i1> undef, i32 undef)
-
- %NXV1F32_VP = call <vscale x 1 x float> @llvm.vp.fsub.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
- %NXV2F32_VP = call <vscale x 2 x float> @llvm.vp.fsub.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
- %NXV4F32_VP = call <vscale x 4 x float> @llvm.vp.fsub.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
- %NXV8F32_VP = call <vscale x 8 x float> @llvm.vp.fsub.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
- %NXV16F32_VP = call <vscale x 16 x float> @llvm.vp.fsub.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-
- %NXV1F64_VP = call <vscale x 1 x double> @llvm.vp.fsub.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
- %NXV2F64_VP = call <vscale x 2 x double> @llvm.vp.fsub.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
- %NXV4F64_VP = call <vscale x 4 x double> @llvm.vp.fsub.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
- %NXV8F64_VP = call <vscale x 8 x double> @llvm.vp.fsub.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+ %F32 = fsub float poison, poison
+ %F64 = fsub double poison, poison
+
+ %V1F32 = fsub <1 x float> poison, poison
+ %V2F32 = fsub <2 x float> poison, poison
+ %V4F32 = fsub <4 x float> poison, poison
+ %V8F32 = fsub <8 x float> poison, poison
+ %V16F32 = fsub <16 x float> poison, poison
+
+ %NXV1F32 = fsub <vscale x 1 x float> poison, poison
+ %NXV2F32 = fsub <vscale x 2 x float> poison, poison
+ %NXV4F32 = fsub <vscale x 4 x float> poison, poison
+ %NXV8F32 = fsub <vscale x 8 x float> poison, poison
+ %NXV16F32 = fsub <vscale x 16 x float> poison, poison
+
+ %V1F64 = fsub <1 x double> poison, poison
+ %V2F64 = fsub <2 x double> poison, poison
+ %V4F64 = fsub <4 x double> poison, poison
+ %V8F64 = fsub <8 x double> poison, poison
+
+ %NXV1F64 = fsub <vscale x 1 x double> poison, poison
+ %NXV2F64 = fsub <vscale x 2 x double> poison, poison
+ %NXV4F64 = fsub <vscale x 4 x double> poison, poison
+ %NXV8F64 = fsub <vscale x 8 x double> poison, poison
+
+ %V1F32_VP = call <1 x float> @llvm.vp.fsub(<1 x float> poison, <1 x float> poison, <1 x i1> poison, i32 poison)
+ %V2F32_VP = call <2 x float> @llvm.vp.fsub(<2 x float> poison, <2 x float> poison, <2 x i1> poison, i32 poison)
+ %V4F32_VP = call <4 x float> @llvm.vp.fsub(<4 x float> poison, <4 x float> poison, <4 x i1> poison, i32 poison)
+ %V8F32_VP = call <8 x float> @llvm.vp.fsub(<8 x float> poison, <8 x float> poison, <8 x i1> poison, i32 poison)
+ %V16F32_VP = call <16 x float> @llvm.vp.fsub(<16 x float> poison, <16 x float> poison, <16 x i1> poison, i32 poison)
+
+ %V1F64_VP = call <1 x double> @llvm.vp.fsub(<1 x double> poison, <1 x double> poison, <1 x i1> poison, i32 poison)
+ %V2F64_VP = call <2 x double> @llvm.vp.fsub(<2 x double> poison, <2 x double> poison, <2 x i1> poison, i32 poison)
+ %V4F64_VP = call <4 x double> @llvm.vp.fsub(<4 x double> poison, <4 x double> poison, <4 x i1> poison, i32 poison)
+ %V8F64_VP = call <8 x double> @llvm.vp.fsub(<8 x double> poison, <8 x double> poison, <8 x i1> poison, i32 poison)
+
+ %NXV1F32_VP = call <vscale x 1 x float> @llvm.vp.fsub(<vscale x 1 x float> poison, <vscale x 1 x float> poison, <vscale x 1 x i1> poison, i32 poison)
+ %NXV2F32_VP = call <vscale x 2 x float> @llvm.vp.fsub(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x i1> poison, i32 poison)
+ %NXV4F32_VP = call <vscale x 4 x float> @llvm.vp.fsub(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x i1> poison, i32 poison)
+ %NXV8F32_VP = call <vscale x 8 x float> @llvm.vp.fsub(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x i1> poison, i32 poison)
+ %NXV16F32_VP = call <vscale x 16 x float> @llvm.vp.fsub(<vscale x 16 x float> poison, <vscale x 16 x float> poison, <vscale x 16 x i1> poison, i32 poison)
+
+ %NXV1F64_VP = call <vscale x 1 x double> @llvm.vp.fsub(<vscale x 1 x double> poison, <vscale x 1 x double> poison, <vscale x 1 x i1> poison, i32 poison)
+ %NXV2F64_VP = call <vscale x 2 x double> @llvm.vp.fsub(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x i1> poison, i32 poison)
+ %NXV4F64_VP = call <vscale x 4 x double> @llvm.vp.fsub(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x i1> poison, i32 poison)
+ %NXV8F64_VP = call <vscale x 8 x double> @llvm.vp.fsub(<vscale x 8 x double> poison, <vscale x 8 x double> poison, <vscale x 8 x i1> poison, i32 poison)
+
+ ret void
+}
+
+define void @fsub_bf16() {
+; ZVFH-LABEL: 'fsub_bf16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fsub bfloat poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1BF16 = fsub <1 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2BF16 = fsub <2 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4BF16 = fsub <4 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8BF16 = fsub <8 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16BF16 = fsub <16 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32BF16 = fsub <32 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1BF16 = fsub <vscale x 1 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2BF16 = fsub <vscale x 2 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV4BF16 = fsub <vscale x 4 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV8BF16 = fsub <vscale x 8 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %NXV16BF16 = fsub <vscale x 16 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32BF16 = fsub <vscale x 32 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1BF16_VP = call <1 x bfloat> @llvm.vp.fsub.v1bf16(<1 x bfloat> poison, <1 x bfloat> poison, <1 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2BF16_VP = call <2 x bfloat> @llvm.vp.fsub.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4BF16_VP = call <4 x bfloat> @llvm.vp.fsub.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8BF16_VP = call <8 x bfloat> @llvm.vp.fsub.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison, <8 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16BF16_VP = call <16 x bfloat> @llvm.vp.fsub.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.fsub.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.fsub.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.fsub.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.fsub.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.fsub.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'fsub_bf16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fsub bfloat poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1BF16 = fsub <1 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2BF16 = fsub <2 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4BF16 = fsub <4 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8BF16 = fsub <8 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16BF16 = fsub <16 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32BF16 = fsub <32 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1BF16 = fsub <vscale x 1 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2BF16 = fsub <vscale x 2 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV4BF16 = fsub <vscale x 4 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV8BF16 = fsub <vscale x 8 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %NXV16BF16 = fsub <vscale x 16 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32BF16 = fsub <vscale x 32 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1BF16_VP = call <1 x bfloat> @llvm.vp.fsub.v1bf16(<1 x bfloat> poison, <1 x bfloat> poison, <1 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2BF16_VP = call <2 x bfloat> @llvm.vp.fsub.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4BF16_VP = call <4 x bfloat> @llvm.vp.fsub.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8BF16_VP = call <8 x bfloat> @llvm.vp.fsub.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison, <8 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16BF16_VP = call <16 x bfloat> @llvm.vp.fsub.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.fsub.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.fsub.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.fsub.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.fsub.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.fsub.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; NO-ZFHMIN-LABEL: 'fsub_bf16'
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fsub bfloat poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16 = fsub <1 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16 = fsub <2 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16 = fsub <4 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16 = fsub <8 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16BF16 = fsub <16 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32BF16 = fsub <32 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16 = fsub <vscale x 1 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16 = fsub <vscale x 2 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16 = fsub <vscale x 4 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16 = fsub <vscale x 8 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16 = fsub <vscale x 16 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV32BF16 = fsub <vscale x 32 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16_VP = call <1 x bfloat> @llvm.vp.fsub.v1bf16(<1 x bfloat> poison, <1 x bfloat> poison, <1 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16_VP = call <2 x bfloat> @llvm.vp.fsub.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16_VP = call <4 x bfloat> @llvm.vp.fsub.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16_VP = call <8 x bfloat> @llvm.vp.fsub.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison, <8 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16BF16_VP = call <16 x bfloat> @llvm.vp.fsub.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.fsub.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.fsub.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.fsub.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.fsub.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.fsub.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %BF16 = fsub bfloat poison, poison
+
+ %V1BF16 = fsub <1 x bfloat> poison, poison
+ %V2BF16 = fsub <2 x bfloat> poison, poison
+ %V4BF16 = fsub <4 x bfloat> poison, poison
+ %V8BF16 = fsub <8 x bfloat> poison, poison
+ %V16BF16 = fsub <16 x bfloat> poison, poison
+ %V32BF16 = fsub <32 x bfloat> poison, poison
+
+ %NXV1BF16 = fsub <vscale x 1 x bfloat> poison, poison
+ %NXV2BF16 = fsub <vscale x 2 x bfloat> poison, poison
+ %NXV4BF16 = fsub <vscale x 4 x bfloat> poison, poison
+ %NXV8BF16 = fsub <vscale x 8 x bfloat> poison, poison
+ %NXV16BF16 = fsub <vscale x 16 x bfloat> poison, poison
+ %NXV32BF16 = fsub <vscale x 32 x bfloat> poison, poison
+
+ %V1BF16_VP = call <1 x bfloat> @llvm.vp.fsub(<1 x bfloat> poison, <1 x bfloat> poison, <1 x i1> poison, i32 poison)
+ %V2BF16_VP = call <2 x bfloat> @llvm.vp.fsub(<2 x bfloat> poison, <2 x bfloat> poison, <2 x i1> poison, i32 poison)
+ %V4BF16_VP = call <4 x bfloat> @llvm.vp.fsub(<4 x bfloat> poison, <4 x bfloat> poison, <4 x i1> poison, i32 poison)
+ %V8BF16_VP = call <8 x bfloat> @llvm.vp.fsub(<8 x bfloat> poison, <8 x bfloat> poison, <8 x i1> poison, i32 poison)
+ %V16BF16_VP = call <16 x bfloat> @llvm.vp.fsub(<16 x bfloat> poison, <16 x bfloat> poison, <16 x i1> poison, i32 poison)
+
+ %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.fsub(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x i1> poison, i32 poison)
+ %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.fsub(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+ %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.fsub(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+ %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.fsub(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+ %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.fsub(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
ret void
}
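+; Costs for fsub on half: scalar, fixed-width and scalable vectors, and the
+; matching @llvm.vp.fsub intrinsic calls, under ZVFH, ZVFHMIN and NO-ZFHMIN.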
define void @fsub_f16() {
; ZVFH-LABEL: 'fsub_f16'
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fsub half undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fsub <1 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fsub <2 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fsub <4 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fsub <8 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fsub <16 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fsub <32 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fsub <vscale x 1 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fsub <vscale x 2 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fsub <vscale x 4 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fsub <vscale x 8 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fsub <vscale x 16 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fsub <vscale x 32 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.fsub.v1f16(<1 x half> undef, <1 x half> undef, <1 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.fsub.v2f16(<2 x half> undef, <2 x half> undef, <2 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.fsub.v4f16(<4 x half> undef, <4 x half> undef, <4 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.fsub.v8f16(<8 x half> undef, <8 x half> undef, <8 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.fsub.v16f16(<16 x half> undef, <16 x half> undef, <16 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fsub.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef, <vscale x 1 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fsub.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef, <vscale x 2 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fsub.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fsub.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fsub.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fsub half poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fsub <1 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fsub <2 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fsub <4 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fsub <8 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fsub <16 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fsub <32 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fsub <vscale x 1 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fsub <vscale x 2 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fsub <vscale x 4 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fsub <vscale x 8 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fsub <vscale x 16 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fsub <vscale x 32 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.fsub.v1f16(<1 x half> poison, <1 x half> poison, <1 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.fsub.v2f16(<2 x half> poison, <2 x half> poison, <2 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.fsub.v4f16(<4 x half> poison, <4 x half> poison, <4 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.fsub.v8f16(<8 x half> poison, <8 x half> poison, <8 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.fsub.v16f16(<16 x half> poison, <16 x half> poison, <16 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fsub.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fsub.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fsub.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fsub.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fsub.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; ZVFHMIN-LABEL: 'fsub_f16'
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fsub half undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1F16 = fsub <1 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F16 = fsub <2 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4F16 = fsub <4 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8F16 = fsub <8 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16F16 = fsub <16 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32F16 = fsub <32 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1F16 = fsub <vscale x 1 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2F16 = fsub <vscale x 2 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV4F16 = fsub <vscale x 4 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV8F16 = fsub <vscale x 8 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %NXV16F16 = fsub <vscale x 16 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fsub <vscale x 32 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.fsub.v1f16(<1 x half> undef, <1 x half> undef, <1 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.fsub.v2f16(<2 x half> undef, <2 x half> undef, <2 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.fsub.v4f16(<4 x half> undef, <4 x half> undef, <4 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.fsub.v8f16(<8 x half> undef, <8 x half> undef, <8 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.fsub.v16f16(<16 x half> undef, <16 x half> undef, <16 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fsub.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef, <vscale x 1 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fsub.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef, <vscale x 2 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fsub.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fsub.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fsub.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fsub half poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1F16 = fsub <1 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F16 = fsub <2 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4F16 = fsub <4 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8F16 = fsub <8 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16F16 = fsub <16 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32F16 = fsub <32 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1F16 = fsub <vscale x 1 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2F16 = fsub <vscale x 2 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV4F16 = fsub <vscale x 4 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV8F16 = fsub <vscale x 8 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %NXV16F16 = fsub <vscale x 16 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fsub <vscale x 32 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.fsub.v1f16(<1 x half> poison, <1 x half> poison, <1 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.fsub.v2f16(<2 x half> poison, <2 x half> poison, <2 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.fsub.v4f16(<4 x half> poison, <4 x half> poison, <4 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.fsub.v8f16(<8 x half> poison, <8 x half> poison, <8 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.fsub.v16f16(<16 x half> poison, <16 x half> poison, <16 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fsub.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fsub.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fsub.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fsub.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fsub.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %F16 = fsub half undef, undef
-
- %V1F16 = fsub <1 x half> undef, undef
- %V2F16 = fsub <2 x half> undef, undef
- %V4F16 = fsub <4 x half> undef, undef
- %V8F16 = fsub <8 x half> undef, undef
- %V16F16 = fsub <16 x half> undef, undef
- %V32F16 = fsub <32 x half> undef, undef
-
- %NXV1F16 = fsub <vscale x 1 x half> undef, undef
- %NXV2F16 = fsub <vscale x 2 x half> undef, undef
- %NXV4F16 = fsub <vscale x 4 x half> undef, undef
- %NXV8F16 = fsub <vscale x 8 x half> undef, undef
- %NXV16F16 = fsub <vscale x 16 x half> undef, undef
- %NXV32F16 = fsub <vscale x 32 x half> undef, undef
-
- %V1F16_VP = call <1 x half> @llvm.vp.fsub.v1f16(<1 x half> undef, <1 x half> undef, <1 x i1> undef, i32 undef)
- %V2F16_VP = call <2 x half> @llvm.vp.fsub.v2f16(<2 x half> undef, <2 x half> undef, <2 x i1> undef, i32 undef)
- %V4F16_VP = call <4 x half> @llvm.vp.fsub.v4f16(<4 x half> undef, <4 x half> undef, <4 x i1> undef, i32 undef)
- %V8F16_VP = call <8 x half> @llvm.vp.fsub.v8f16(<8 x half> undef, <8 x half> undef, <8 x i1> undef, i32 undef)
- %V16F16_VP = call <16 x half> @llvm.vp.fsub.v16f16(<16 x half> undef, <16 x half> undef, <16 x i1> undef, i32 undef)
-
- %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fsub.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef, <vscale x 1 x i1> undef, i32 undef)
- %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fsub.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef, <vscale x 2 x i1> undef, i32 undef)
- %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fsub.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x i1> undef, i32 undef)
- %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fsub.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x i1> undef, i32 undef)
- %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fsub.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef)
+; NO-ZFHMIN-LABEL: 'fsub_f16'
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fsub half poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fsub <1 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F16 = fsub <2 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F16 = fsub <4 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F16 = fsub <8 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F16 = fsub <16 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32F16 = fsub <32 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16 = fsub <vscale x 1 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16 = fsub <vscale x 2 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16 = fsub <vscale x 4 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16 = fsub <vscale x 8 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16 = fsub <vscale x 16 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV32F16 = fsub <vscale x 32 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.fsub.v1f16(<1 x half> poison, <1 x half> poison, <1 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.fsub.v2f16(<2 x half> poison, <2 x half> poison, <2 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.fsub.v4f16(<4 x half> poison, <4 x half> poison, <4 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.fsub.v8f16(<8 x half> poison, <8 x half> poison, <8 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.fsub.v16f16(<16 x half> poison, <16 x half> poison, <16 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fsub.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fsub.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fsub.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fsub.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fsub.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %F16 = fsub half poison, poison
+
+ %V1F16 = fsub <1 x half> poison, poison
+ %V2F16 = fsub <2 x half> poison, poison
+ %V4F16 = fsub <4 x half> poison, poison
+ %V8F16 = fsub <8 x half> poison, poison
+ %V16F16 = fsub <16 x half> poison, poison
+ %V32F16 = fsub <32 x half> poison, poison
+
+ %NXV1F16 = fsub <vscale x 1 x half> poison, poison
+ %NXV2F16 = fsub <vscale x 2 x half> poison, poison
+ %NXV4F16 = fsub <vscale x 4 x half> poison, poison
+ %NXV8F16 = fsub <vscale x 8 x half> poison, poison
+ %NXV16F16 = fsub <vscale x 16 x half> poison, poison
+ %NXV32F16 = fsub <vscale x 32 x half> poison, poison
+
+ %V1F16_VP = call <1 x half> @llvm.vp.fsub(<1 x half> poison, <1 x half> poison, <1 x i1> poison, i32 poison)
+ %V2F16_VP = call <2 x half> @llvm.vp.fsub(<2 x half> poison, <2 x half> poison, <2 x i1> poison, i32 poison)
+ %V4F16_VP = call <4 x half> @llvm.vp.fsub(<4 x half> poison, <4 x half> poison, <4 x i1> poison, i32 poison)
+ %V8F16_VP = call <8 x half> @llvm.vp.fsub(<8 x half> poison, <8 x half> poison, <8 x i1> poison, i32 poison)
+ %V16F16_VP = call <16 x half> @llvm.vp.fsub(<16 x half> poison, <16 x half> poison, <16 x i1> poison, i32 poison)
+
+ %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fsub(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x i1> poison, i32 poison)
+ %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fsub(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+ %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fsub(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+ %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fsub(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+ %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fsub(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
ret void
}
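+; Costs for fmul on float and double: scalar, fixed-width and scalable
+; vectors, and the matching @llvm.vp.fmul intrinsic calls.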
define void @fmul() {
; CHECK-LABEL: 'fmul'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fmul bfloat undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fmul float undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fmul double undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1BF16 = fmul <1 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2BF16 = fmul <2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4BF16 = fmul <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8BF16 = fmul <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16BF16 = fmul <16 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1BF16 = fmul <vscale x 1 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2BF16 = fmul <vscale x 2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV4BF16 = fmul <vscale x 4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV8BF16 = fmul <vscale x 8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %NXV16BF16 = fmul <vscale x 16 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fmul <1 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fmul <2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fmul <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fmul <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fmul <16 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fmul <vscale x 1 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fmul <vscale x 2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fmul <vscale x 4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fmul <vscale x 8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fmul <vscale x 16 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fmul <1 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fmul <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fmul <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fmul <8 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fmul <vscale x 1 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64 = fmul <vscale x 2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fmul <vscale x 4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fmul <vscale x 8 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16_VP = call <1 x bfloat> @llvm.vp.fmul.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef, <1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F16_VP = call <2 x bfloat> @llvm.vp.fmul.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4F16_VP = call <4 x bfloat> @llvm.vp.fmul.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8F16_VP = call <8 x bfloat> @llvm.vp.fmul.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16F16_VP = call <16 x bfloat> @llvm.vp.fmul.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32_VP = call <1 x float> @llvm.vp.fmul.v1f32(<1 x float> undef, <1 x float> undef, <1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32_VP = call <2 x float> @llvm.vp.fmul.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32_VP = call <4 x float> @llvm.vp.fmul.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32_VP = call <8 x float> @llvm.vp.fmul.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32_VP = call <16 x float> @llvm.vp.fmul.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64_VP = call <1 x double> @llvm.vp.fmul.v1f64(<1 x double> undef, <1 x double> undef, <1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64_VP = call <2 x double> @llvm.vp.fmul.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64_VP = call <4 x double> @llvm.vp.fmul.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64_VP = call <8 x double> @llvm.vp.fmul.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV1F16_VP = call <vscale x 1 x bfloat> @llvm.vp.fmul.nxv1bf16(<vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV2F16_VP = call <vscale x 2 x bfloat> @llvm.vp.fmul.nxv2bf16(<vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV4F16_VP = call <vscale x 4 x bfloat> @llvm.vp.fmul.nxv4bf16(<vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV8F16_VP = call <vscale x 8 x bfloat> @llvm.vp.fmul.nxv8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NXV16F16_VP = call <vscale x 16 x bfloat> @llvm.vp.fmul.nxv16bf16(<vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32_VP = call <vscale x 1 x float> @llvm.vp.fmul.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32_VP = call <vscale x 2 x float> @llvm.vp.fmul.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32_VP = call <vscale x 4 x float> @llvm.vp.fmul.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32_VP = call <vscale x 8 x float> @llvm.vp.fmul.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32_VP = call <vscale x 16 x float> @llvm.vp.fmul.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64_VP = call <vscale x 1 x double> @llvm.vp.fmul.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64_VP = call <vscale x 2 x double> @llvm.vp.fmul.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64_VP = call <vscale x 4 x double> @llvm.vp.fmul.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64_VP = call <vscale x 8 x double> @llvm.vp.fmul.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fmul float poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fmul double poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fmul <1 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fmul <2 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fmul <4 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fmul <8 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fmul <16 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fmul <vscale x 1 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fmul <vscale x 2 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fmul <vscale x 4 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fmul <vscale x 8 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fmul <vscale x 16 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fmul <1 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fmul <2 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fmul <4 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fmul <8 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fmul <vscale x 1 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64 = fmul <vscale x 2 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fmul <vscale x 4 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fmul <vscale x 8 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32_VP = call <1 x float> @llvm.vp.fmul.v1f32(<1 x float> poison, <1 x float> poison, <1 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32_VP = call <2 x float> @llvm.vp.fmul.v2f32(<2 x float> poison, <2 x float> poison, <2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32_VP = call <4 x float> @llvm.vp.fmul.v4f32(<4 x float> poison, <4 x float> poison, <4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32_VP = call <8 x float> @llvm.vp.fmul.v8f32(<8 x float> poison, <8 x float> poison, <8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32_VP = call <16 x float> @llvm.vp.fmul.v16f32(<16 x float> poison, <16 x float> poison, <16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64_VP = call <1 x double> @llvm.vp.fmul.v1f64(<1 x double> poison, <1 x double> poison, <1 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64_VP = call <2 x double> @llvm.vp.fmul.v2f64(<2 x double> poison, <2 x double> poison, <2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64_VP = call <4 x double> @llvm.vp.fmul.v4f64(<4 x double> poison, <4 x double> poison, <4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64_VP = call <8 x double> @llvm.vp.fmul.v8f64(<8 x double> poison, <8 x double> poison, <8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32_VP = call <vscale x 1 x float> @llvm.vp.fmul.nxv1f32(<vscale x 1 x float> poison, <vscale x 1 x float> poison, <vscale x 1 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32_VP = call <vscale x 2 x float> @llvm.vp.fmul.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32_VP = call <vscale x 4 x float> @llvm.vp.fmul.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32_VP = call <vscale x 8 x float> @llvm.vp.fmul.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32_VP = call <vscale x 16 x float> @llvm.vp.fmul.nxv16f32(<vscale x 16 x float> poison, <vscale x 16 x float> poison, <vscale x 16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64_VP = call <vscale x 1 x double> @llvm.vp.fmul.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x double> poison, <vscale x 1 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64_VP = call <vscale x 2 x double> @llvm.vp.fmul.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64_VP = call <vscale x 4 x double> @llvm.vp.fmul.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64_VP = call <vscale x 8 x double> @llvm.vp.fmul.nxv8f64(<vscale x 8 x double> poison, <vscale x 8 x double> poison, <vscale x 8 x i1> poison, i32 poison)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %BF16 = fmul bfloat undef, undef
- %F32 = fmul float undef, undef
- %F64 = fmul double undef, undef
-
- %V1BF16 = fmul <1 x bfloat> undef, undef
- %V2BF16 = fmul <2 x bfloat> undef, undef
- %V4BF16 = fmul <4 x bfloat> undef, undef
- %V8BF16 = fmul <8 x bfloat> undef, undef
- %V16BF16 = fmul <16 x bfloat> undef, undef
-
- %NXV1BF16 = fmul <vscale x 1 x bfloat> undef, undef
- %NXV2BF16 = fmul <vscale x 2 x bfloat> undef, undef
- %NXV4BF16 = fmul <vscale x 4 x bfloat> undef, undef
- %NXV8BF16 = fmul <vscale x 8 x bfloat> undef, undef
- %NXV16BF16 = fmul <vscale x 16 x bfloat> undef, undef
-
- %V1F32 = fmul <1 x float> undef, undef
- %V2F32 = fmul <2 x float> undef, undef
- %V4F32 = fmul <4 x float> undef, undef
- %V8F32 = fmul <8 x float> undef, undef
- %V16F32 = fmul <16 x float> undef, undef
-
- %NXV1F32 = fmul <vscale x 1 x float> undef, undef
- %NXV2F32 = fmul <vscale x 2 x float> undef, undef
- %NXV4F32 = fmul <vscale x 4 x float> undef, undef
- %NXV8F32 = fmul <vscale x 8 x float> undef, undef
- %NXV16F32 = fmul <vscale x 16 x float> undef, undef
-
- %V1F64 = fmul <1 x double> undef, undef
- %V2F64 = fmul <2 x double> undef, undef
- %V4F64 = fmul <4 x double> undef, undef
- %V8F64 = fmul <8 x double> undef, undef
-
- %NXV1F64 = fmul <vscale x 1 x double> undef, undef
- %NXV2F64 = fmul <vscale x 2 x double> undef, undef
- %NXV4F64 = fmul <vscale x 4 x double> undef, undef
- %NXV8F64 = fmul <vscale x 8 x double> undef, undef
-
- %V1F16_VP = call <1 x bfloat> @llvm.vp.fmul.v1f16(<1 x bfloat> undef, <1 x bfloat> undef, <1 x i1> undef, i32 undef)
- %V2F16_VP = call <2 x bfloat> @llvm.vp.fmul.v2f16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x i1> undef, i32 undef)
- %V4F16_VP = call <4 x bfloat> @llvm.vp.fmul.v4f16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x i1> undef, i32 undef)
- %V8F16_VP = call <8 x bfloat> @llvm.vp.fmul.v8f16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x i1> undef, i32 undef)
- %V16F16_VP = call <16 x bfloat> @llvm.vp.fmul.v16f16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x i1> undef, i32 undef)
-
- %V1F32_VP = call <1 x float> @llvm.vp.fmul.v1f32(<1 x float> undef, <1 x float> undef, <1 x i1> undef, i32 undef)
- %V2F32_VP = call <2 x float> @llvm.vp.fmul.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
- %V4F32_VP = call <4 x float> @llvm.vp.fmul.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
- %V8F32_VP = call <8 x float> @llvm.vp.fmul.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
- %V16F32_VP = call <16 x float> @llvm.vp.fmul.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
-
- %V1F64_VP = call <1 x double> @llvm.vp.fmul.v1f64(<1 x double> undef, <1 x double> undef, <1 x i1> undef, i32 undef)
- %V2F64_VP = call <2 x double> @llvm.vp.fmul.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
- %V4F64_VP = call <4 x double> @llvm.vp.fmul.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
- %V8F64_VP = call <8 x double> @llvm.vp.fmul.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
-
- %NXV1F16_VP = call <vscale x 1 x bfloat> @llvm.vp.fmul.nxv1f16(<vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef, <vscale x 1 x i1> undef, i32 undef)
- %NXV2F16_VP = call <vscale x 2 x bfloat> @llvm.vp.fmul.nxv2f16(<vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef, <vscale x 2 x i1> undef, i32 undef)
- %NXV4F16_VP = call <vscale x 4 x bfloat> @llvm.vp.fmul.nxv4f16(<vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef, <vscale x 4 x i1> undef, i32 undef)
- %NXV8F16_VP = call <vscale x 8 x bfloat> @llvm.vp.fmul.nxv8f16(<vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef, <vscale x 8 x i1> undef, i32 undef)
- %NXV16F16_VP = call <vscale x 16 x bfloat> @llvm.vp.fmul.nxv16f16(<vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef, <vscale x 16 x i1> undef, i32 undef)
-
- %NXV1F32_VP = call <vscale x 1 x float> @llvm.vp.fmul.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
- %NXV2F32_VP = call <vscale x 2 x float> @llvm.vp.fmul.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
- %NXV4F32_VP = call <vscale x 4 x float> @llvm.vp.fmul.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
- %NXV8F32_VP = call <vscale x 8 x float> @llvm.vp.fmul.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
- %NXV16F32_VP = call <vscale x 16 x float> @llvm.vp.fmul.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-
- %NXV1F64_VP = call <vscale x 1 x double> @llvm.vp.fmul.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
- %NXV2F64_VP = call <vscale x 2 x double> @llvm.vp.fmul.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
- %NXV4F64_VP = call <vscale x 4 x double> @llvm.vp.fmul.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
- %NXV8F64_VP = call <vscale x 8 x double> @llvm.vp.fmul.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+ %F32 = fmul float poison, poison
+ %F64 = fmul double poison, poison
+
+ %V1F32 = fmul <1 x float> poison, poison
+ %V2F32 = fmul <2 x float> poison, poison
+ %V4F32 = fmul <4 x float> poison, poison
+ %V8F32 = fmul <8 x float> poison, poison
+ %V16F32 = fmul <16 x float> poison, poison
+
+ %NXV1F32 = fmul <vscale x 1 x float> poison, poison
+ %NXV2F32 = fmul <vscale x 2 x float> poison, poison
+ %NXV4F32 = fmul <vscale x 4 x float> poison, poison
+ %NXV8F32 = fmul <vscale x 8 x float> poison, poison
+ %NXV16F32 = fmul <vscale x 16 x float> poison, poison
+
+ %V1F64 = fmul <1 x double> poison, poison
+ %V2F64 = fmul <2 x double> poison, poison
+ %V4F64 = fmul <4 x double> poison, poison
+ %V8F64 = fmul <8 x double> poison, poison
+
+ %NXV1F64 = fmul <vscale x 1 x double> poison, poison
+ %NXV2F64 = fmul <vscale x 2 x double> poison, poison
+ %NXV4F64 = fmul <vscale x 4 x double> poison, poison
+ %NXV8F64 = fmul <vscale x 8 x double> poison, poison
+
+ %V1F32_VP = call <1 x float> @llvm.vp.fmul(<1 x float> poison, <1 x float> poison, <1 x i1> poison, i32 poison)
+ %V2F32_VP = call <2 x float> @llvm.vp.fmul(<2 x float> poison, <2 x float> poison, <2 x i1> poison, i32 poison)
+ %V4F32_VP = call <4 x float> @llvm.vp.fmul(<4 x float> poison, <4 x float> poison, <4 x i1> poison, i32 poison)
+ %V8F32_VP = call <8 x float> @llvm.vp.fmul(<8 x float> poison, <8 x float> poison, <8 x i1> poison, i32 poison)
+ %V16F32_VP = call <16 x float> @llvm.vp.fmul(<16 x float> poison, <16 x float> poison, <16 x i1> poison, i32 poison)
+
+ %V1F64_VP = call <1 x double> @llvm.vp.fmul(<1 x double> poison, <1 x double> poison, <1 x i1> poison, i32 poison)
+ %V2F64_VP = call <2 x double> @llvm.vp.fmul(<2 x double> poison, <2 x double> poison, <2 x i1> poison, i32 poison)
+ %V4F64_VP = call <4 x double> @llvm.vp.fmul(<4 x double> poison, <4 x double> poison, <4 x i1> poison, i32 poison)
+ %V8F64_VP = call <8 x double> @llvm.vp.fmul(<8 x double> poison, <8 x double> poison, <8 x i1> poison, i32 poison)
+
+ %NXV1F32_VP = call <vscale x 1 x float> @llvm.vp.fmul(<vscale x 1 x float> poison, <vscale x 1 x float> poison, <vscale x 1 x i1> poison, i32 poison)
+ %NXV2F32_VP = call <vscale x 2 x float> @llvm.vp.fmul(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x i1> poison, i32 poison)
+ %NXV4F32_VP = call <vscale x 4 x float> @llvm.vp.fmul(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x i1> poison, i32 poison)
+ %NXV8F32_VP = call <vscale x 8 x float> @llvm.vp.fmul(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x i1> poison, i32 poison)
+ %NXV16F32_VP = call <vscale x 16 x float> @llvm.vp.fmul(<vscale x 16 x float> poison, <vscale x 16 x float> poison, <vscale x 16 x i1> poison, i32 poison)
+
+ %NXV1F64_VP = call <vscale x 1 x double> @llvm.vp.fmul(<vscale x 1 x double> poison, <vscale x 1 x double> poison, <vscale x 1 x i1> poison, i32 poison)
+ %NXV2F64_VP = call <vscale x 2 x double> @llvm.vp.fmul(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x i1> poison, i32 poison)
+ %NXV4F64_VP = call <vscale x 4 x double> @llvm.vp.fmul(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x i1> poison, i32 poison)
+ %NXV8F64_VP = call <vscale x 8 x double> @llvm.vp.fmul(<vscale x 8 x double> poison, <vscale x 8 x double> poison, <vscale x 8 x i1> poison, i32 poison)
+
+ ret void
+}
+
+define void @fmul_bf16() {
+; ZVFH-LABEL: 'fmul_bf16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fmul bfloat poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1BF16 = fmul <1 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2BF16 = fmul <2 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4BF16 = fmul <4 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8BF16 = fmul <8 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16BF16 = fmul <16 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32BF16 = fmul <32 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1BF16 = fmul <vscale x 1 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2BF16 = fmul <vscale x 2 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV4BF16 = fmul <vscale x 4 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV8BF16 = fmul <vscale x 8 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %NXV16BF16 = fmul <vscale x 16 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32BF16 = fmul <vscale x 32 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1BF16_VP = call <1 x bfloat> @llvm.vp.fmul.v1bf16(<1 x bfloat> poison, <1 x bfloat> poison, <1 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2BF16_VP = call <2 x bfloat> @llvm.vp.fmul.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4BF16_VP = call <4 x bfloat> @llvm.vp.fmul.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8BF16_VP = call <8 x bfloat> @llvm.vp.fmul.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison, <8 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16BF16_VP = call <16 x bfloat> @llvm.vp.fmul.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.fmul.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.fmul.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.fmul.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.fmul.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.fmul.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'fmul_bf16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fmul bfloat poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1BF16 = fmul <1 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2BF16 = fmul <2 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4BF16 = fmul <4 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8BF16 = fmul <8 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16BF16 = fmul <16 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32BF16 = fmul <32 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1BF16 = fmul <vscale x 1 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2BF16 = fmul <vscale x 2 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV4BF16 = fmul <vscale x 4 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV8BF16 = fmul <vscale x 8 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %NXV16BF16 = fmul <vscale x 16 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32BF16 = fmul <vscale x 32 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1BF16_VP = call <1 x bfloat> @llvm.vp.fmul.v1bf16(<1 x bfloat> poison, <1 x bfloat> poison, <1 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2BF16_VP = call <2 x bfloat> @llvm.vp.fmul.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4BF16_VP = call <4 x bfloat> @llvm.vp.fmul.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8BF16_VP = call <8 x bfloat> @llvm.vp.fmul.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison, <8 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16BF16_VP = call <16 x bfloat> @llvm.vp.fmul.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.fmul.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.fmul.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.fmul.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.fmul.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.fmul.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; NO-ZFHMIN-LABEL: 'fmul_bf16'
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fmul bfloat poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16 = fmul <1 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16 = fmul <2 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16 = fmul <4 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16 = fmul <8 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16BF16 = fmul <16 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32BF16 = fmul <32 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16 = fmul <vscale x 1 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16 = fmul <vscale x 2 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16 = fmul <vscale x 4 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16 = fmul <vscale x 8 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16 = fmul <vscale x 16 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV32BF16 = fmul <vscale x 32 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16_VP = call <1 x bfloat> @llvm.vp.fmul.v1bf16(<1 x bfloat> poison, <1 x bfloat> poison, <1 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16_VP = call <2 x bfloat> @llvm.vp.fmul.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16_VP = call <4 x bfloat> @llvm.vp.fmul.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16_VP = call <8 x bfloat> @llvm.vp.fmul.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison, <8 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16BF16_VP = call <16 x bfloat> @llvm.vp.fmul.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.fmul.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.fmul.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.fmul.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.fmul.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.fmul.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %BF16 = fmul bfloat poison, poison
+
+ %V1BF16 = fmul <1 x bfloat> poison, poison
+ %V2BF16 = fmul <2 x bfloat> poison, poison
+ %V4BF16 = fmul <4 x bfloat> poison, poison
+ %V8BF16 = fmul <8 x bfloat> poison, poison
+ %V16BF16 = fmul <16 x bfloat> poison, poison
+ %V32BF16 = fmul <32 x bfloat> poison, poison
+
+ %NXV1BF16 = fmul <vscale x 1 x bfloat> poison, poison
+ %NXV2BF16 = fmul <vscale x 2 x bfloat> poison, poison
+ %NXV4BF16 = fmul <vscale x 4 x bfloat> poison, poison
+ %NXV8BF16 = fmul <vscale x 8 x bfloat> poison, poison
+ %NXV16BF16 = fmul <vscale x 16 x bfloat> poison, poison
+ %NXV32BF16 = fmul <vscale x 32 x bfloat> poison, poison
+
+ %V1BF16_VP = call <1 x bfloat> @llvm.vp.fmul(<1 x bfloat> poison, <1 x bfloat> poison, <1 x i1> poison, i32 poison)
+ %V2BF16_VP = call <2 x bfloat> @llvm.vp.fmul(<2 x bfloat> poison, <2 x bfloat> poison, <2 x i1> poison, i32 poison)
+ %V4BF16_VP = call <4 x bfloat> @llvm.vp.fmul(<4 x bfloat> poison, <4 x bfloat> poison, <4 x i1> poison, i32 poison)
+ %V8BF16_VP = call <8 x bfloat> @llvm.vp.fmul(<8 x bfloat> poison, <8 x bfloat> poison, <8 x i1> poison, i32 poison)
+ %V16BF16_VP = call <16 x bfloat> @llvm.vp.fmul(<16 x bfloat> poison, <16 x bfloat> poison, <16 x i1> poison, i32 poison)
+
+ %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.fmul(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x i1> poison, i32 poison)
+ %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.fmul(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+ %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.fmul(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+ %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.fmul(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+ %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.fmul(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
ret void
}
define void @fmul_f16() {
; ZVFH-LABEL: 'fmul_f16'
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fmul half undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fmul <1 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fmul <2 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fmul <4 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fmul <8 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fmul <16 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fmul <32 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fmul <vscale x 1 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fmul <vscale x 2 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fmul <vscale x 4 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fmul <vscale x 8 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fmul <vscale x 16 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fmul <vscale x 32 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.fmul.v1f16(<1 x half> undef, <1 x half> undef, <1 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.fmul.v2f16(<2 x half> undef, <2 x half> undef, <2 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.fmul.v4f16(<4 x half> undef, <4 x half> undef, <4 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.fmul.v8f16(<8 x half> undef, <8 x half> undef, <8 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.fmul.v16f16(<16 x half> undef, <16 x half> undef, <16 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fmul.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef, <vscale x 1 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fmul.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef, <vscale x 2 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fmul.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fmul.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fmul.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fmul half poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fmul <1 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fmul <2 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fmul <4 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fmul <8 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fmul <16 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fmul <32 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fmul <vscale x 1 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fmul <vscale x 2 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fmul <vscale x 4 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fmul <vscale x 8 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fmul <vscale x 16 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fmul <vscale x 32 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.fmul.v1f16(<1 x half> poison, <1 x half> poison, <1 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.fmul.v2f16(<2 x half> poison, <2 x half> poison, <2 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.fmul.v4f16(<4 x half> poison, <4 x half> poison, <4 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.fmul.v8f16(<8 x half> poison, <8 x half> poison, <8 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.fmul.v16f16(<16 x half> poison, <16 x half> poison, <16 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fmul.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fmul.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fmul.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fmul.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fmul.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; ZVFHMIN-LABEL: 'fmul_f16'
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fmul half undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1F16 = fmul <1 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F16 = fmul <2 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4F16 = fmul <4 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8F16 = fmul <8 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16F16 = fmul <16 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32F16 = fmul <32 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1F16 = fmul <vscale x 1 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2F16 = fmul <vscale x 2 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV4F16 = fmul <vscale x 4 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV8F16 = fmul <vscale x 8 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %NXV16F16 = fmul <vscale x 16 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fmul <vscale x 32 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.fmul.v1f16(<1 x half> undef, <1 x half> undef, <1 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.fmul.v2f16(<2 x half> undef, <2 x half> undef, <2 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.fmul.v4f16(<4 x half> undef, <4 x half> undef, <4 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.fmul.v8f16(<8 x half> undef, <8 x half> undef, <8 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.fmul.v16f16(<16 x half> undef, <16 x half> undef, <16 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fmul.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef, <vscale x 1 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fmul.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef, <vscale x 2 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fmul.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fmul.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fmul.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fmul half poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1F16 = fmul <1 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F16 = fmul <2 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4F16 = fmul <4 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8F16 = fmul <8 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16F16 = fmul <16 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32F16 = fmul <32 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1F16 = fmul <vscale x 1 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2F16 = fmul <vscale x 2 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV4F16 = fmul <vscale x 4 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV8F16 = fmul <vscale x 8 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %NXV16F16 = fmul <vscale x 16 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fmul <vscale x 32 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.fmul.v1f16(<1 x half> poison, <1 x half> poison, <1 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.fmul.v2f16(<2 x half> poison, <2 x half> poison, <2 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.fmul.v4f16(<4 x half> poison, <4 x half> poison, <4 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.fmul.v8f16(<8 x half> poison, <8 x half> poison, <8 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.fmul.v16f16(<16 x half> poison, <16 x half> poison, <16 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fmul.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fmul.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fmul.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fmul.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fmul.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %F16 = fmul half undef, undef
-
- %V1F16 = fmul <1 x half> undef, undef
- %V2F16 = fmul <2 x half> undef, undef
- %V4F16 = fmul <4 x half> undef, undef
- %V8F16 = fmul <8 x half> undef, undef
- %V16F16 = fmul <16 x half> undef, undef
- %V32F16 = fmul <32 x half> undef, undef
-
- %NXV1F16 = fmul <vscale x 1 x half> undef, undef
- %NXV2F16 = fmul <vscale x 2 x half> undef, undef
- %NXV4F16 = fmul <vscale x 4 x half> undef, undef
- %NXV8F16 = fmul <vscale x 8 x half> undef, undef
- %NXV16F16 = fmul <vscale x 16 x half> undef, undef
- %NXV32F16 = fmul <vscale x 32 x half> undef, undef
-
- %V1F16_VP = call <1 x half> @llvm.vp.fmul.v1f16(<1 x half> undef, <1 x half> undef, <1 x i1> undef, i32 undef)
- %V2F16_VP = call <2 x half> @llvm.vp.fmul.v2f16(<2 x half> undef, <2 x half> undef, <2 x i1> undef, i32 undef)
- %V4F16_VP = call <4 x half> @llvm.vp.fmul.v4f16(<4 x half> undef, <4 x half> undef, <4 x i1> undef, i32 undef)
- %V8F16_VP = call <8 x half> @llvm.vp.fmul.v8f16(<8 x half> undef, <8 x half> undef, <8 x i1> undef, i32 undef)
- %V16F16_VP = call <16 x half> @llvm.vp.fmul.v16f16(<16 x half> undef, <16 x half> undef, <16 x i1> undef, i32 undef)
-
- %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fmul.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef, <vscale x 1 x i1> undef, i32 undef)
- %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fmul.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef, <vscale x 2 x i1> undef, i32 undef)
- %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fmul.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x i1> undef, i32 undef)
- %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fmul.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x i1> undef, i32 undef)
- %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fmul.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef)
+; NO-ZFHMIN-LABEL: 'fmul_f16'
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fmul half poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fmul <1 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F16 = fmul <2 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F16 = fmul <4 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F16 = fmul <8 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F16 = fmul <16 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32F16 = fmul <32 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16 = fmul <vscale x 1 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16 = fmul <vscale x 2 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16 = fmul <vscale x 4 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16 = fmul <vscale x 8 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16 = fmul <vscale x 16 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV32F16 = fmul <vscale x 32 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.fmul.v1f16(<1 x half> poison, <1 x half> poison, <1 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.fmul.v2f16(<2 x half> poison, <2 x half> poison, <2 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.fmul.v4f16(<4 x half> poison, <4 x half> poison, <4 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.fmul.v8f16(<8 x half> poison, <8 x half> poison, <8 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.fmul.v16f16(<16 x half> poison, <16 x half> poison, <16 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fmul.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fmul.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fmul.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fmul.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fmul.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %F16 = fmul half poison, poison
+
+ %V1F16 = fmul <1 x half> poison, poison
+ %V2F16 = fmul <2 x half> poison, poison
+ %V4F16 = fmul <4 x half> poison, poison
+ %V8F16 = fmul <8 x half> poison, poison
+ %V16F16 = fmul <16 x half> poison, poison
+ %V32F16 = fmul <32 x half> poison, poison
+
+ %NXV1F16 = fmul <vscale x 1 x half> poison, poison
+ %NXV2F16 = fmul <vscale x 2 x half> poison, poison
+ %NXV4F16 = fmul <vscale x 4 x half> poison, poison
+ %NXV8F16 = fmul <vscale x 8 x half> poison, poison
+ %NXV16F16 = fmul <vscale x 16 x half> poison, poison
+ %NXV32F16 = fmul <vscale x 32 x half> poison, poison
+
+ %V1F16_VP = call <1 x half> @llvm.vp.fmul(<1 x half> poison, <1 x half> poison, <1 x i1> poison, i32 poison)
+ %V2F16_VP = call <2 x half> @llvm.vp.fmul(<2 x half> poison, <2 x half> poison, <2 x i1> poison, i32 poison)
+ %V4F16_VP = call <4 x half> @llvm.vp.fmul(<4 x half> poison, <4 x half> poison, <4 x i1> poison, i32 poison)
+ %V8F16_VP = call <8 x half> @llvm.vp.fmul(<8 x half> poison, <8 x half> poison, <8 x i1> poison, i32 poison)
+ %V16F16_VP = call <16 x half> @llvm.vp.fmul(<16 x half> poison, <16 x half> poison, <16 x i1> poison, i32 poison)
+
+ %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fmul(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x i1> poison, i32 poison)
+ %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fmul(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+ %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fmul(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+ %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fmul(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+ %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fmul(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
ret void
}
define void @fdiv() {
; CHECK-LABEL: 'fdiv'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fdiv bfloat undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fdiv float undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fdiv double undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1BF16 = fdiv <1 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2BF16 = fdiv <2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4BF16 = fdiv <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8BF16 = fdiv <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16BF16 = fdiv <16 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1BF16 = fdiv <vscale x 1 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2BF16 = fdiv <vscale x 2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV4BF16 = fdiv <vscale x 4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV8BF16 = fdiv <vscale x 8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %NXV16BF16 = fdiv <vscale x 16 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fdiv <1 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fdiv <2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fdiv <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fdiv <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fdiv <16 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fdiv <vscale x 1 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fdiv <vscale x 2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fdiv <vscale x 4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fdiv <vscale x 8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fdiv <vscale x 16 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fdiv <1 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fdiv <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fdiv <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fdiv <8 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fdiv <vscale x 1 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64 = fdiv <vscale x 2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fdiv <vscale x 4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fdiv <vscale x 8 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16_VP = call <1 x bfloat> @llvm.vp.fdiv.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef, <1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F16_VP = call <2 x bfloat> @llvm.vp.fdiv.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4F16_VP = call <4 x bfloat> @llvm.vp.fdiv.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8F16_VP = call <8 x bfloat> @llvm.vp.fdiv.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16F16_VP = call <16 x bfloat> @llvm.vp.fdiv.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32_VP = call <1 x float> @llvm.vp.fdiv.v1f32(<1 x float> undef, <1 x float> undef, <1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32_VP = call <2 x float> @llvm.vp.fdiv.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32_VP = call <4 x float> @llvm.vp.fdiv.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32_VP = call <8 x float> @llvm.vp.fdiv.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32_VP = call <16 x float> @llvm.vp.fdiv.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64_VP = call <1 x double> @llvm.vp.fdiv.v1f64(<1 x double> undef, <1 x double> undef, <1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64_VP = call <2 x double> @llvm.vp.fdiv.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64_VP = call <4 x double> @llvm.vp.fdiv.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64_VP = call <8 x double> @llvm.vp.fdiv.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV1F16_VP = call <vscale x 1 x bfloat> @llvm.vp.fdiv.nxv1bf16(<vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV2F16_VP = call <vscale x 2 x bfloat> @llvm.vp.fdiv.nxv2bf16(<vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV4F16_VP = call <vscale x 4 x bfloat> @llvm.vp.fdiv.nxv4bf16(<vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV8F16_VP = call <vscale x 8 x bfloat> @llvm.vp.fdiv.nxv8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NXV16F16_VP = call <vscale x 16 x bfloat> @llvm.vp.fdiv.nxv16bf16(<vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32_VP = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32_VP = call <vscale x 2 x float> @llvm.vp.fdiv.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32_VP = call <vscale x 4 x float> @llvm.vp.fdiv.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32_VP = call <vscale x 8 x float> @llvm.vp.fdiv.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32_VP = call <vscale x 16 x float> @llvm.vp.fdiv.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64_VP = call <vscale x 1 x double> @llvm.vp.fdiv.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64_VP = call <vscale x 2 x double> @llvm.vp.fdiv.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64_VP = call <vscale x 4 x double> @llvm.vp.fdiv.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64_VP = call <vscale x 8 x double> @llvm.vp.fdiv.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fdiv float poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fdiv double poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fdiv <1 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fdiv <2 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fdiv <4 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fdiv <8 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fdiv <16 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fdiv <vscale x 1 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fdiv <vscale x 2 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fdiv <vscale x 4 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fdiv <vscale x 8 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fdiv <vscale x 16 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fdiv <1 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fdiv <2 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fdiv <4 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fdiv <8 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fdiv <vscale x 1 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64 = fdiv <vscale x 2 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fdiv <vscale x 4 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fdiv <vscale x 8 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32_VP = call <1 x float> @llvm.vp.fdiv.v1f32(<1 x float> poison, <1 x float> poison, <1 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32_VP = call <2 x float> @llvm.vp.fdiv.v2f32(<2 x float> poison, <2 x float> poison, <2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32_VP = call <4 x float> @llvm.vp.fdiv.v4f32(<4 x float> poison, <4 x float> poison, <4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32_VP = call <8 x float> @llvm.vp.fdiv.v8f32(<8 x float> poison, <8 x float> poison, <8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32_VP = call <16 x float> @llvm.vp.fdiv.v16f32(<16 x float> poison, <16 x float> poison, <16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64_VP = call <1 x double> @llvm.vp.fdiv.v1f64(<1 x double> poison, <1 x double> poison, <1 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64_VP = call <2 x double> @llvm.vp.fdiv.v2f64(<2 x double> poison, <2 x double> poison, <2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64_VP = call <4 x double> @llvm.vp.fdiv.v4f64(<4 x double> poison, <4 x double> poison, <4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64_VP = call <8 x double> @llvm.vp.fdiv.v8f64(<8 x double> poison, <8 x double> poison, <8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32_VP = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> poison, <vscale x 1 x float> poison, <vscale x 1 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32_VP = call <vscale x 2 x float> @llvm.vp.fdiv.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32_VP = call <vscale x 4 x float> @llvm.vp.fdiv.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32_VP = call <vscale x 8 x float> @llvm.vp.fdiv.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32_VP = call <vscale x 16 x float> @llvm.vp.fdiv.nxv16f32(<vscale x 16 x float> poison, <vscale x 16 x float> poison, <vscale x 16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64_VP = call <vscale x 1 x double> @llvm.vp.fdiv.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x double> poison, <vscale x 1 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64_VP = call <vscale x 2 x double> @llvm.vp.fdiv.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64_VP = call <vscale x 4 x double> @llvm.vp.fdiv.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64_VP = call <vscale x 8 x double> @llvm.vp.fdiv.nxv8f64(<vscale x 8 x double> poison, <vscale x 8 x double> poison, <vscale x 8 x i1> poison, i32 poison)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %BF16 = fdiv bfloat undef, undef
- %F32 = fdiv float undef, undef
- %F64 = fdiv double undef, undef
-
- %V1BF16 = fdiv <1 x bfloat> undef, undef
- %V2BF16 = fdiv <2 x bfloat> undef, undef
- %V4BF16 = fdiv <4 x bfloat> undef, undef
- %V8BF16 = fdiv <8 x bfloat> undef, undef
- %V16BF16 = fdiv <16 x bfloat> undef, undef
-
- %NXV1BF16 = fdiv <vscale x 1 x bfloat> undef, undef
- %NXV2BF16 = fdiv <vscale x 2 x bfloat> undef, undef
- %NXV4BF16 = fdiv <vscale x 4 x bfloat> undef, undef
- %NXV8BF16 = fdiv <vscale x 8 x bfloat> undef, undef
- %NXV16BF16 = fdiv <vscale x 16 x bfloat> undef, undef
-
- %V1F32 = fdiv <1 x float> undef, undef
- %V2F32 = fdiv <2 x float> undef, undef
- %V4F32 = fdiv <4 x float> undef, undef
- %V8F32 = fdiv <8 x float> undef, undef
- %V16F32 = fdiv <16 x float> undef, undef
-
- %NXV1F32 = fdiv <vscale x 1 x float> undef, undef
- %NXV2F32 = fdiv <vscale x 2 x float> undef, undef
- %NXV4F32 = fdiv <vscale x 4 x float> undef, undef
- %NXV8F32 = fdiv <vscale x 8 x float> undef, undef
- %NXV16F32 = fdiv <vscale x 16 x float> undef, undef
-
- %V1F64 = fdiv <1 x double> undef, undef
- %V2F64 = fdiv <2 x double> undef, undef
- %V4F64 = fdiv <4 x double> undef, undef
- %V8F64 = fdiv <8 x double> undef, undef
-
- %NXV1F64 = fdiv <vscale x 1 x double> undef, undef
- %NXV2F64 = fdiv <vscale x 2 x double> undef, undef
- %NXV4F64 = fdiv <vscale x 4 x double> undef, undef
- %NXV8F64 = fdiv <vscale x 8 x double> undef, undef
-
- %V1F16_VP = call <1 x bfloat> @llvm.vp.fdiv.v1f16(<1 x bfloat> undef, <1 x bfloat> undef, <1 x i1> undef, i32 undef)
- %V2F16_VP = call <2 x bfloat> @llvm.vp.fdiv.v2f16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x i1> undef, i32 undef)
- %V4F16_VP = call <4 x bfloat> @llvm.vp.fdiv.v4f16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x i1> undef, i32 undef)
- %V8F16_VP = call <8 x bfloat> @llvm.vp.fdiv.v8f16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x i1> undef, i32 undef)
- %V16F16_VP = call <16 x bfloat> @llvm.vp.fdiv.v16f16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x i1> undef, i32 undef)
-
- %V1F32_VP = call <1 x float> @llvm.vp.fdiv.v1f32(<1 x float> undef, <1 x float> undef, <1 x i1> undef, i32 undef)
- %V2F32_VP = call <2 x float> @llvm.vp.fdiv.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
- %V4F32_VP = call <4 x float> @llvm.vp.fdiv.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
- %V8F32_VP = call <8 x float> @llvm.vp.fdiv.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
- %V16F32_VP = call <16 x float> @llvm.vp.fdiv.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
-
- %V1F64_VP = call <1 x double> @llvm.vp.fdiv.v1f64(<1 x double> undef, <1 x double> undef, <1 x i1> undef, i32 undef)
- %V2F64_VP = call <2 x double> @llvm.vp.fdiv.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
- %V4F64_VP = call <4 x double> @llvm.vp.fdiv.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
- %V8F64_VP = call <8 x double> @llvm.vp.fdiv.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
-
- %NXV1F16_VP = call <vscale x 1 x bfloat> @llvm.vp.fdiv.nxv1f16(<vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef, <vscale x 1 x i1> undef, i32 undef)
- %NXV2F16_VP = call <vscale x 2 x bfloat> @llvm.vp.fdiv.nxv2f16(<vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef, <vscale x 2 x i1> undef, i32 undef)
- %NXV4F16_VP = call <vscale x 4 x bfloat> @llvm.vp.fdiv.nxv4f16(<vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef, <vscale x 4 x i1> undef, i32 undef)
- %NXV8F16_VP = call <vscale x 8 x bfloat> @llvm.vp.fdiv.nxv8f16(<vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef, <vscale x 8 x i1> undef, i32 undef)
- %NXV16F16_VP = call <vscale x 16 x bfloat> @llvm.vp.fdiv.nxv16f16(<vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef, <vscale x 16 x i1> undef, i32 undef)
-
- %NXV1F32_VP = call <vscale x 1 x float> @llvm.vp.fdiv.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
- %NXV2F32_VP = call <vscale x 2 x float> @llvm.vp.fdiv.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
- %NXV4F32_VP = call <vscale x 4 x float> @llvm.vp.fdiv.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
- %NXV8F32_VP = call <vscale x 8 x float> @llvm.vp.fdiv.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
- %NXV16F32_VP = call <vscale x 16 x float> @llvm.vp.fdiv.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-
- %NXV1F64_VP = call <vscale x 1 x double> @llvm.vp.fdiv.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
- %NXV2F64_VP = call <vscale x 2 x double> @llvm.vp.fdiv.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
- %NXV4F64_VP = call <vscale x 4 x double> @llvm.vp.fdiv.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
- %NXV8F64_VP = call <vscale x 8 x double> @llvm.vp.fdiv.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+ %F32 = fdiv float poison, poison
+ %F64 = fdiv double poison, poison
+
+ %V1F32 = fdiv <1 x float> poison, poison
+ %V2F32 = fdiv <2 x float> poison, poison
+ %V4F32 = fdiv <4 x float> poison, poison
+ %V8F32 = fdiv <8 x float> poison, poison
+ %V16F32 = fdiv <16 x float> poison, poison
+
+ %NXV1F32 = fdiv <vscale x 1 x float> poison, poison
+ %NXV2F32 = fdiv <vscale x 2 x float> poison, poison
+ %NXV4F32 = fdiv <vscale x 4 x float> poison, poison
+ %NXV8F32 = fdiv <vscale x 8 x float> poison, poison
+ %NXV16F32 = fdiv <vscale x 16 x float> poison, poison
+
+ %V1F64 = fdiv <1 x double> poison, poison
+ %V2F64 = fdiv <2 x double> poison, poison
+ %V4F64 = fdiv <4 x double> poison, poison
+ %V8F64 = fdiv <8 x double> poison, poison
+
+ %NXV1F64 = fdiv <vscale x 1 x double> poison, poison
+ %NXV2F64 = fdiv <vscale x 2 x double> poison, poison
+ %NXV4F64 = fdiv <vscale x 4 x double> poison, poison
+ %NXV8F64 = fdiv <vscale x 8 x double> poison, poison
+
+ %V1F32_VP = call <1 x float> @llvm.vp.fdiv(<1 x float> poison, <1 x float> poison, <1 x i1> poison, i32 poison)
+ %V2F32_VP = call <2 x float> @llvm.vp.fdiv(<2 x float> poison, <2 x float> poison, <2 x i1> poison, i32 poison)
+ %V4F32_VP = call <4 x float> @llvm.vp.fdiv(<4 x float> poison, <4 x float> poison, <4 x i1> poison, i32 poison)
+ %V8F32_VP = call <8 x float> @llvm.vp.fdiv(<8 x float> poison, <8 x float> poison, <8 x i1> poison, i32 poison)
+ %V16F32_VP = call <16 x float> @llvm.vp.fdiv(<16 x float> poison, <16 x float> poison, <16 x i1> poison, i32 poison)
+
+ %V1F64_VP = call <1 x double> @llvm.vp.fdiv(<1 x double> poison, <1 x double> poison, <1 x i1> poison, i32 poison)
+ %V2F64_VP = call <2 x double> @llvm.vp.fdiv(<2 x double> poison, <2 x double> poison, <2 x i1> poison, i32 poison)
+ %V4F64_VP = call <4 x double> @llvm.vp.fdiv(<4 x double> poison, <4 x double> poison, <4 x i1> poison, i32 poison)
+ %V8F64_VP = call <8 x double> @llvm.vp.fdiv(<8 x double> poison, <8 x double> poison, <8 x i1> poison, i32 poison)
+
+ %NXV1F32_VP = call <vscale x 1 x float> @llvm.vp.fdiv(<vscale x 1 x float> poison, <vscale x 1 x float> poison, <vscale x 1 x i1> poison, i32 poison)
+ %NXV2F32_VP = call <vscale x 2 x float> @llvm.vp.fdiv(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x i1> poison, i32 poison)
+ %NXV4F32_VP = call <vscale x 4 x float> @llvm.vp.fdiv(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x i1> poison, i32 poison)
+ %NXV8F32_VP = call <vscale x 8 x float> @llvm.vp.fdiv(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x i1> poison, i32 poison)
+ %NXV16F32_VP = call <vscale x 16 x float> @llvm.vp.fdiv(<vscale x 16 x float> poison, <vscale x 16 x float> poison, <vscale x 16 x i1> poison, i32 poison)
+
+ %NXV1F64_VP = call <vscale x 1 x double> @llvm.vp.fdiv(<vscale x 1 x double> poison, <vscale x 1 x double> poison, <vscale x 1 x i1> poison, i32 poison)
+ %NXV2F64_VP = call <vscale x 2 x double> @llvm.vp.fdiv(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x i1> poison, i32 poison)
+ %NXV4F64_VP = call <vscale x 4 x double> @llvm.vp.fdiv(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x i1> poison, i32 poison)
+ %NXV8F64_VP = call <vscale x 8 x double> @llvm.vp.fdiv(<vscale x 8 x double> poison, <vscale x 8 x double> poison, <vscale x 8 x i1> poison, i32 poison)
+
+ ret void
+}
+
+define void @fdiv_bf16() {
+; ZVFH-LABEL: 'fdiv_bf16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fdiv bfloat poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1BF16 = fdiv <1 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2BF16 = fdiv <2 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4BF16 = fdiv <4 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8BF16 = fdiv <8 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16BF16 = fdiv <16 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32BF16 = fdiv <32 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1BF16 = fdiv <vscale x 1 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2BF16 = fdiv <vscale x 2 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV4BF16 = fdiv <vscale x 4 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV8BF16 = fdiv <vscale x 8 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %NXV16BF16 = fdiv <vscale x 16 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32BF16 = fdiv <vscale x 32 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1BF16_VP = call <1 x bfloat> @llvm.vp.fdiv.v1bf16(<1 x bfloat> poison, <1 x bfloat> poison, <1 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2BF16_VP = call <2 x bfloat> @llvm.vp.fdiv.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4BF16_VP = call <4 x bfloat> @llvm.vp.fdiv.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8BF16_VP = call <8 x bfloat> @llvm.vp.fdiv.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison, <8 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16BF16_VP = call <16 x bfloat> @llvm.vp.fdiv.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.fdiv.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.fdiv.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.fdiv.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.fdiv.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.fdiv.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'fdiv_bf16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fdiv bfloat poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1BF16 = fdiv <1 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2BF16 = fdiv <2 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4BF16 = fdiv <4 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8BF16 = fdiv <8 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16BF16 = fdiv <16 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32BF16 = fdiv <32 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1BF16 = fdiv <vscale x 1 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2BF16 = fdiv <vscale x 2 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV4BF16 = fdiv <vscale x 4 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV8BF16 = fdiv <vscale x 8 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %NXV16BF16 = fdiv <vscale x 16 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32BF16 = fdiv <vscale x 32 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1BF16_VP = call <1 x bfloat> @llvm.vp.fdiv.v1bf16(<1 x bfloat> poison, <1 x bfloat> poison, <1 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2BF16_VP = call <2 x bfloat> @llvm.vp.fdiv.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4BF16_VP = call <4 x bfloat> @llvm.vp.fdiv.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8BF16_VP = call <8 x bfloat> @llvm.vp.fdiv.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison, <8 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16BF16_VP = call <16 x bfloat> @llvm.vp.fdiv.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.fdiv.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.fdiv.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.fdiv.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.fdiv.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.fdiv.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; NO-ZFHMIN-LABEL: 'fdiv_bf16'
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fdiv bfloat poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16 = fdiv <1 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16 = fdiv <2 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16 = fdiv <4 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16 = fdiv <8 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16BF16 = fdiv <16 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32BF16 = fdiv <32 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16 = fdiv <vscale x 1 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16 = fdiv <vscale x 2 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16 = fdiv <vscale x 4 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16 = fdiv <vscale x 8 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16 = fdiv <vscale x 16 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV32BF16 = fdiv <vscale x 32 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16_VP = call <1 x bfloat> @llvm.vp.fdiv.v1bf16(<1 x bfloat> poison, <1 x bfloat> poison, <1 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16_VP = call <2 x bfloat> @llvm.vp.fdiv.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16_VP = call <4 x bfloat> @llvm.vp.fdiv.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16_VP = call <8 x bfloat> @llvm.vp.fdiv.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison, <8 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16BF16_VP = call <16 x bfloat> @llvm.vp.fdiv.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.fdiv.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.fdiv.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.fdiv.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.fdiv.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.fdiv.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %BF16 = fdiv bfloat poison, poison
+
+ %V1BF16 = fdiv <1 x bfloat> poison, poison
+ %V2BF16 = fdiv <2 x bfloat> poison, poison
+ %V4BF16 = fdiv <4 x bfloat> poison, poison
+ %V8BF16 = fdiv <8 x bfloat> poison, poison
+ %V16BF16 = fdiv <16 x bfloat> poison, poison
+ %V32BF16 = fdiv <32 x bfloat> poison, poison
+
+ %NXV1BF16 = fdiv <vscale x 1 x bfloat> poison, poison
+ %NXV2BF16 = fdiv <vscale x 2 x bfloat> poison, poison
+ %NXV4BF16 = fdiv <vscale x 4 x bfloat> poison, poison
+ %NXV8BF16 = fdiv <vscale x 8 x bfloat> poison, poison
+ %NXV16BF16 = fdiv <vscale x 16 x bfloat> poison, poison
+ %NXV32BF16 = fdiv <vscale x 32 x bfloat> poison, poison
+
+ %V1BF16_VP = call <1 x bfloat> @llvm.vp.fdiv(<1 x bfloat> poison, <1 x bfloat> poison, <1 x i1> poison, i32 poison)
+ %V2BF16_VP = call <2 x bfloat> @llvm.vp.fdiv(<2 x bfloat> poison, <2 x bfloat> poison, <2 x i1> poison, i32 poison)
+ %V4BF16_VP = call <4 x bfloat> @llvm.vp.fdiv(<4 x bfloat> poison, <4 x bfloat> poison, <4 x i1> poison, i32 poison)
+ %V8BF16_VP = call <8 x bfloat> @llvm.vp.fdiv(<8 x bfloat> poison, <8 x bfloat> poison, <8 x i1> poison, i32 poison)
+ %V16BF16_VP = call <16 x bfloat> @llvm.vp.fdiv(<16 x bfloat> poison, <16 x bfloat> poison, <16 x i1> poison, i32 poison)
+
+ %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.fdiv(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x i1> poison, i32 poison)
+ %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.fdiv(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+ %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.fdiv(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+ %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.fdiv(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+ %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.fdiv(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
ret void
}
define void @fdiv_f16() {
; ZVFH-LABEL: 'fdiv_f16'
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fdiv half undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fdiv <1 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fdiv <2 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fdiv <4 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fdiv <8 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fdiv <16 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fdiv <32 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fdiv <vscale x 1 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fdiv <vscale x 2 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fdiv <vscale x 4 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fdiv <vscale x 8 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fdiv <vscale x 16 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fdiv <vscale x 32 x half> undef, undef
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.fdiv.v1f16(<1 x half> undef, <1 x half> undef, <1 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.fdiv.v2f16(<2 x half> undef, <2 x half> undef, <2 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.fdiv.v4f16(<4 x half> undef, <4 x half> undef, <4 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.fdiv.v8f16(<8 x half> undef, <8 x half> undef, <8 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.fdiv.v16f16(<16 x half> undef, <16 x half> undef, <16 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fdiv.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef, <vscale x 1 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fdiv.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef, <vscale x 2 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fdiv.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fdiv.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x i1> undef, i32 undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fdiv.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fdiv half poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fdiv <1 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fdiv <2 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fdiv <4 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fdiv <8 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fdiv <16 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fdiv <32 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fdiv <vscale x 1 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fdiv <vscale x 2 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fdiv <vscale x 4 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fdiv <vscale x 8 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fdiv <vscale x 16 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fdiv <vscale x 32 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.fdiv.v1f16(<1 x half> poison, <1 x half> poison, <1 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.fdiv.v2f16(<2 x half> poison, <2 x half> poison, <2 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.fdiv.v4f16(<4 x half> poison, <4 x half> poison, <4 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.fdiv.v8f16(<8 x half> poison, <8 x half> poison, <8 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.fdiv.v16f16(<16 x half> poison, <16 x half> poison, <16 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fdiv.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fdiv.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fdiv.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fdiv.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fdiv.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; ZVFHMIN-LABEL: 'fdiv_f16'
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fdiv half undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1F16 = fdiv <1 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F16 = fdiv <2 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4F16 = fdiv <4 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8F16 = fdiv <8 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16F16 = fdiv <16 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32F16 = fdiv <32 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1F16 = fdiv <vscale x 1 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2F16 = fdiv <vscale x 2 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV4F16 = fdiv <vscale x 4 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV8F16 = fdiv <vscale x 8 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %NXV16F16 = fdiv <vscale x 16 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fdiv <vscale x 32 x half> undef, undef
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.fdiv.v1f16(<1 x half> undef, <1 x half> undef, <1 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.fdiv.v2f16(<2 x half> undef, <2 x half> undef, <2 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.fdiv.v4f16(<4 x half> undef, <4 x half> undef, <4 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.fdiv.v8f16(<8 x half> undef, <8 x half> undef, <8 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.fdiv.v16f16(<16 x half> undef, <16 x half> undef, <16 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fdiv.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef, <vscale x 1 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fdiv.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef, <vscale x 2 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fdiv.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fdiv.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x i1> undef, i32 undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fdiv.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fdiv half poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1F16 = fdiv <1 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F16 = fdiv <2 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4F16 = fdiv <4 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8F16 = fdiv <8 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16F16 = fdiv <16 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32F16 = fdiv <32 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1F16 = fdiv <vscale x 1 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2F16 = fdiv <vscale x 2 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV4F16 = fdiv <vscale x 4 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NXV8F16 = fdiv <vscale x 8 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %NXV16F16 = fdiv <vscale x 16 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fdiv <vscale x 32 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.fdiv.v1f16(<1 x half> poison, <1 x half> poison, <1 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.fdiv.v2f16(<2 x half> poison, <2 x half> poison, <2 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.fdiv.v4f16(<4 x half> poison, <4 x half> poison, <4 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.fdiv.v8f16(<8 x half> poison, <8 x half> poison, <8 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.fdiv.v16f16(<16 x half> poison, <16 x half> poison, <16 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fdiv.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fdiv.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fdiv.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fdiv.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fdiv.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %F16 = fdiv half undef, undef
-
- %V1F16 = fdiv <1 x half> undef, undef
- %V2F16 = fdiv <2 x half> undef, undef
- %V4F16 = fdiv <4 x half> undef, undef
- %V8F16 = fdiv <8 x half> undef, undef
- %V16F16 = fdiv <16 x half> undef, undef
- %V32F16 = fdiv <32 x half> undef, undef
-
- %NXV1F16 = fdiv <vscale x 1 x half> undef, undef
- %NXV2F16 = fdiv <vscale x 2 x half> undef, undef
- %NXV4F16 = fdiv <vscale x 4 x half> undef, undef
- %NXV8F16 = fdiv <vscale x 8 x half> undef, undef
- %NXV16F16 = fdiv <vscale x 16 x half> undef, undef
- %NXV32F16 = fdiv <vscale x 32 x half> undef, undef
-
- %V1F16_VP = call <1 x half> @llvm.vp.fdiv.v1f16(<1 x half> undef, <1 x half> undef, <1 x i1> undef, i32 undef)
- %V2F16_VP = call <2 x half> @llvm.vp.fdiv.v2f16(<2 x half> undef, <2 x half> undef, <2 x i1> undef, i32 undef)
- %V4F16_VP = call <4 x half> @llvm.vp.fdiv.v4f16(<4 x half> undef, <4 x half> undef, <4 x i1> undef, i32 undef)
- %V8F16_VP = call <8 x half> @llvm.vp.fdiv.v8f16(<8 x half> undef, <8 x half> undef, <8 x i1> undef, i32 undef)
- %V16F16_VP = call <16 x half> @llvm.vp.fdiv.v16f16(<16 x half> undef, <16 x half> undef, <16 x i1> undef, i32 undef)
-
- %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fdiv.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef, <vscale x 1 x i1> undef, i32 undef)
- %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fdiv.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef, <vscale x 2 x i1> undef, i32 undef)
- %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fdiv.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x i1> undef, i32 undef)
- %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fdiv.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x i1> undef, i32 undef)
- %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fdiv.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef)
+; NO-ZFHMIN-LABEL: 'fdiv_f16'
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fdiv half poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fdiv <1 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F16 = fdiv <2 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F16 = fdiv <4 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F16 = fdiv <8 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F16 = fdiv <16 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32F16 = fdiv <32 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16 = fdiv <vscale x 1 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16 = fdiv <vscale x 2 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16 = fdiv <vscale x 4 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16 = fdiv <vscale x 8 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16 = fdiv <vscale x 16 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV32F16 = fdiv <vscale x 32 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.fdiv.v1f16(<1 x half> poison, <1 x half> poison, <1 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.fdiv.v2f16(<2 x half> poison, <2 x half> poison, <2 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.fdiv.v4f16(<4 x half> poison, <4 x half> poison, <4 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.fdiv.v8f16(<8 x half> poison, <8 x half> poison, <8 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.fdiv.v16f16(<16 x half> poison, <16 x half> poison, <16 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fdiv.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fdiv.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fdiv.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fdiv.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fdiv.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %F16 = fdiv half poison, poison
+
+ %V1F16 = fdiv <1 x half> poison, poison
+ %V2F16 = fdiv <2 x half> poison, poison
+ %V4F16 = fdiv <4 x half> poison, poison
+ %V8F16 = fdiv <8 x half> poison, poison
+ %V16F16 = fdiv <16 x half> poison, poison
+ %V32F16 = fdiv <32 x half> poison, poison
+
+ %NXV1F16 = fdiv <vscale x 1 x half> poison, poison
+ %NXV2F16 = fdiv <vscale x 2 x half> poison, poison
+ %NXV4F16 = fdiv <vscale x 4 x half> poison, poison
+ %NXV8F16 = fdiv <vscale x 8 x half> poison, poison
+ %NXV16F16 = fdiv <vscale x 16 x half> poison, poison
+ %NXV32F16 = fdiv <vscale x 32 x half> poison, poison
+
+ %V1F16_VP = call <1 x half> @llvm.vp.fdiv(<1 x half> poison, <1 x half> poison, <1 x i1> poison, i32 poison)
+ %V2F16_VP = call <2 x half> @llvm.vp.fdiv(<2 x half> poison, <2 x half> poison, <2 x i1> poison, i32 poison)
+ %V4F16_VP = call <4 x half> @llvm.vp.fdiv(<4 x half> poison, <4 x half> poison, <4 x i1> poison, i32 poison)
+ %V8F16_VP = call <8 x half> @llvm.vp.fdiv(<8 x half> poison, <8 x half> poison, <8 x i1> poison, i32 poison)
+ %V16F16_VP = call <16 x half> @llvm.vp.fdiv(<16 x half> poison, <16 x half> poison, <16 x i1> poison, i32 poison)
+
+ %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fdiv(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x i1> poison, i32 poison)
+ %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fdiv(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+ %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fdiv(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+ %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fdiv(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+ %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fdiv(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
ret void
}
define void @frem() {
; CHECK-LABEL: 'frem'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = frem bfloat undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1BF16 = frem <1 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2BF16 = frem <2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4BF16 = frem <4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8BF16 = frem <8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16BF16 = frem <16 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16 = frem <vscale x 1 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16 = frem <vscale x 2 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16 = frem <vscale x 4 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16 = frem <vscale x 8 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16 = frem <vscale x 16 x bfloat> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = frem <1 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = frem <2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F32 = frem <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = frem <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16F32 = frem <16 x float> undef, undef
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1F32 = frem <vscale x 1 x float> undef, undef
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2F32 = frem <vscale x 2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV4F32 = frem <vscale x 4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV8F32 = frem <vscale x 8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV16F32 = frem <vscale x 16 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = frem <1 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = frem <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F64 = frem <8 x double> undef, undef
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1F64 = frem <vscale x 1 x double> undef, undef
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2F64 = frem <vscale x 2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV4F64 = frem <vscale x 4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV8F64 = frem <vscale x 8 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1F16_VP = call <1 x bfloat> @llvm.vp.frem.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef, <1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F16_VP = call <2 x bfloat> @llvm.vp.frem.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F16_VP = call <4 x bfloat> @llvm.vp.frem.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V8F16_VP = call <8 x bfloat> @llvm.vp.frem.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V16F16_VP = call <16 x bfloat> @llvm.vp.frem.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1F32_VP = call <1 x float> @llvm.vp.frem.v1f32(<1 x float> undef, <1 x float> undef, <1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32_VP = call <2 x float> @llvm.vp.frem.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F32_VP = call <4 x float> @llvm.vp.frem.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8F32_VP = call <8 x float> @llvm.vp.frem.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V16F32_VP = call <16 x float> @llvm.vp.frem.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1F64_VP = call <1 x double> @llvm.vp.frem.v1f64(<1 x double> undef, <1 x double> undef, <1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F64_VP = call <2 x double> @llvm.vp.frem.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F64_VP = call <4 x double> @llvm.vp.frem.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8F64_VP = call <8 x double> @llvm.vp.frem.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16_VP = call <vscale x 1 x bfloat> @llvm.vp.frem.nxv1bf16(<vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16_VP = call <vscale x 2 x bfloat> @llvm.vp.frem.nxv2bf16(<vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16_VP = call <vscale x 4 x bfloat> @llvm.vp.frem.nxv4bf16(<vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16_VP = call <vscale x 8 x bfloat> @llvm.vp.frem.nxv8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16_VP = call <vscale x 16 x bfloat> @llvm.vp.frem.nxv16bf16(<vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1F32_VP = call <vscale x 1 x float> @llvm.vp.frem.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2F32_VP = call <vscale x 2 x float> @llvm.vp.frem.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV4F32_VP = call <vscale x 4 x float> @llvm.vp.frem.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV8F32_VP = call <vscale x 8 x float> @llvm.vp.frem.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV16F32_VP = call <vscale x 16 x float> @llvm.vp.frem.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1F64_VP = call <vscale x 1 x double> @llvm.vp.frem.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2F64_VP = call <vscale x 2 x double> @llvm.vp.frem.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV4F64_VP = call <vscale x 4 x double> @llvm.vp.frem.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV8F64_VP = call <vscale x 8 x double> @llvm.vp.frem.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = frem <1 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = frem <2 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F32 = frem <4 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = frem <8 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16F32 = frem <16 x float> poison, poison
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1F32 = frem <vscale x 1 x float> poison, poison
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2F32 = frem <vscale x 2 x float> poison, poison
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV4F32 = frem <vscale x 4 x float> poison, poison
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV8F32 = frem <vscale x 8 x float> poison, poison
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV16F32 = frem <vscale x 16 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = frem <1 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = frem <4 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F64 = frem <8 x double> poison, poison
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1F64 = frem <vscale x 1 x double> poison, poison
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2F64 = frem <vscale x 2 x double> poison, poison
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV4F64 = frem <vscale x 4 x double> poison, poison
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV8F64 = frem <vscale x 8 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1F32_VP = call <1 x float> @llvm.vp.frem.v1f32(<1 x float> poison, <1 x float> poison, <1 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32_VP = call <2 x float> @llvm.vp.frem.v2f32(<2 x float> poison, <2 x float> poison, <2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F32_VP = call <4 x float> @llvm.vp.frem.v4f32(<4 x float> poison, <4 x float> poison, <4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8F32_VP = call <8 x float> @llvm.vp.frem.v8f32(<8 x float> poison, <8 x float> poison, <8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V16F32_VP = call <16 x float> @llvm.vp.frem.v16f32(<16 x float> poison, <16 x float> poison, <16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1F64_VP = call <1 x double> @llvm.vp.frem.v1f64(<1 x double> poison, <1 x double> poison, <1 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F64_VP = call <2 x double> @llvm.vp.frem.v2f64(<2 x double> poison, <2 x double> poison, <2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F64_VP = call <4 x double> @llvm.vp.frem.v4f64(<4 x double> poison, <4 x double> poison, <4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8F64_VP = call <8 x double> @llvm.vp.frem.v8f64(<8 x double> poison, <8 x double> poison, <8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1F32_VP = call <vscale x 1 x float> @llvm.vp.frem.nxv1f32(<vscale x 1 x float> poison, <vscale x 1 x float> poison, <vscale x 1 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2F32_VP = call <vscale x 2 x float> @llvm.vp.frem.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV4F32_VP = call <vscale x 4 x float> @llvm.vp.frem.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV8F32_VP = call <vscale x 8 x float> @llvm.vp.frem.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV16F32_VP = call <vscale x 16 x float> @llvm.vp.frem.nxv16f32(<vscale x 16 x float> poison, <vscale x 16 x float> poison, <vscale x 16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1F64_VP = call <vscale x 1 x double> @llvm.vp.frem.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x double> poison, <vscale x 1 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2F64_VP = call <vscale x 2 x double> @llvm.vp.frem.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV4F64_VP = call <vscale x 4 x double> @llvm.vp.frem.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV8F64_VP = call <vscale x 8 x double> @llvm.vp.frem.nxv8f64(<vscale x 8 x double> poison, <vscale x 8 x double> poison, <vscale x 8 x i1> poison, i32 poison)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %BF16 = frem bfloat undef, undef
- %F32 = frem float undef, undef
- %F64 = frem double undef, undef
-
- %V1BF16 = frem <1 x bfloat> undef, undef
- %V2BF16 = frem <2 x bfloat> undef, undef
- %V4BF16 = frem <4 x bfloat> undef, undef
- %V8BF16 = frem <8 x bfloat> undef, undef
- %V16BF16 = frem <16 x bfloat> undef, undef
-
- %NXV1BF16 = frem <vscale x 1 x bfloat> undef, undef
- %NXV2BF16 = frem <vscale x 2 x bfloat> undef, undef
- %NXV4BF16 = frem <vscale x 4 x bfloat> undef, undef
- %NXV8BF16 = frem <vscale x 8 x bfloat> undef, undef
- %NXV16BF16 = frem <vscale x 16 x bfloat> undef, undef
-
- %V1F32 = frem <1 x float> undef, undef
- %V2F32 = frem <2 x float> undef, undef
- %V4F32 = frem <4 x float> undef, undef
- %V8F32 = frem <8 x float> undef, undef
- %V16F32 = frem <16 x float> undef, undef
-
- %NXV1F32 = frem <vscale x 1 x float> undef, undef
- %NXV2F32 = frem <vscale x 2 x float> undef, undef
- %NXV4F32 = frem <vscale x 4 x float> undef, undef
- %NXV8F32 = frem <vscale x 8 x float> undef, undef
- %NXV16F32 = frem <vscale x 16 x float> undef, undef
-
- %V1F64 = frem <1 x double> undef, undef
- %V2F64 = frem <2 x double> undef, undef
- %V4F64 = frem <4 x double> undef, undef
- %V8F64 = frem <8 x double> undef, undef
-
- %NXV1F64 = frem <vscale x 1 x double> undef, undef
- %NXV2F64 = frem <vscale x 2 x double> undef, undef
- %NXV4F64 = frem <vscale x 4 x double> undef, undef
- %NXV8F64 = frem <vscale x 8 x double> undef, undef
-
- %V1F16_VP = call <1 x bfloat> @llvm.vp.frem.v1f16(<1 x bfloat> undef, <1 x bfloat> undef, <1 x i1> undef, i32 undef)
- %V2F16_VP = call <2 x bfloat> @llvm.vp.frem.v2f16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x i1> undef, i32 undef)
- %V4F16_VP = call <4 x bfloat> @llvm.vp.frem.v4f16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x i1> undef, i32 undef)
- %V8F16_VP = call <8 x bfloat> @llvm.vp.frem.v8f16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x i1> undef, i32 undef)
- %V16F16_VP = call <16 x bfloat> @llvm.vp.frem.v16f16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x i1> undef, i32 undef)
-
- %V1F32_VP = call <1 x float> @llvm.vp.frem.v1f32(<1 x float> undef, <1 x float> undef, <1 x i1> undef, i32 undef)
- %V2F32_VP = call <2 x float> @llvm.vp.frem.v2f32(<2 x float> undef, <2 x float> undef, <2 x i1> undef, i32 undef)
- %V4F32_VP = call <4 x float> @llvm.vp.frem.v4f32(<4 x float> undef, <4 x float> undef, <4 x i1> undef, i32 undef)
- %V8F32_VP = call <8 x float> @llvm.vp.frem.v8f32(<8 x float> undef, <8 x float> undef, <8 x i1> undef, i32 undef)
- %V16F32_VP = call <16 x float> @llvm.vp.frem.v16f32(<16 x float> undef, <16 x float> undef, <16 x i1> undef, i32 undef)
-
- %V1F64_VP = call <1 x double> @llvm.vp.frem.v1f64(<1 x double> undef, <1 x double> undef, <1 x i1> undef, i32 undef)
- %V2F64_VP = call <2 x double> @llvm.vp.frem.v2f64(<2 x double> undef, <2 x double> undef, <2 x i1> undef, i32 undef)
- %V4F64_VP = call <4 x double> @llvm.vp.frem.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> undef, i32 undef)
- %V8F64_VP = call <8 x double> @llvm.vp.frem.v8f64(<8 x double> undef, <8 x double> undef, <8 x i1> undef, i32 undef)
-
- %NXV1F16_VP = call <vscale x 1 x bfloat> @llvm.vp.frem.nxv1f16(<vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef, <vscale x 1 x i1> undef, i32 undef)
- %NXV2F16_VP = call <vscale x 2 x bfloat> @llvm.vp.frem.nxv2f16(<vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef, <vscale x 2 x i1> undef, i32 undef)
- %NXV4F16_VP = call <vscale x 4 x bfloat> @llvm.vp.frem.nxv4f16(<vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef, <vscale x 4 x i1> undef, i32 undef)
- %NXV8F16_VP = call <vscale x 8 x bfloat> @llvm.vp.frem.nxv8f16(<vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef, <vscale x 8 x i1> undef, i32 undef)
- %NXV16F16_VP = call <vscale x 16 x bfloat> @llvm.vp.frem.nxv16f16(<vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef, <vscale x 16 x i1> undef, i32 undef)
-
- %NXV1F32_VP = call <vscale x 1 x float> @llvm.vp.frem.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
- %NXV2F32_VP = call <vscale x 2 x float> @llvm.vp.frem.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
- %NXV4F32_VP = call <vscale x 4 x float> @llvm.vp.frem.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
- %NXV8F32_VP = call <vscale x 8 x float> @llvm.vp.frem.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
- %NXV16F32_VP = call <vscale x 16 x float> @llvm.vp.frem.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-
- %NXV1F64_VP = call <vscale x 1 x double> @llvm.vp.frem.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
- %NXV2F64_VP = call <vscale x 2 x double> @llvm.vp.frem.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
- %NXV4F64_VP = call <vscale x 4 x double> @llvm.vp.frem.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
- %NXV8F64_VP = call <vscale x 8 x double> @llvm.vp.frem.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+ %F32 = frem float poison, poison
+ %F64 = frem double poison, poison
+
+ %V1F32 = frem <1 x float> poison, poison
+ %V2F32 = frem <2 x float> poison, poison
+ %V4F32 = frem <4 x float> poison, poison
+ %V8F32 = frem <8 x float> poison, poison
+ %V16F32 = frem <16 x float> poison, poison
+
+ %NXV1F32 = frem <vscale x 1 x float> poison, poison
+ %NXV2F32 = frem <vscale x 2 x float> poison, poison
+ %NXV4F32 = frem <vscale x 4 x float> poison, poison
+ %NXV8F32 = frem <vscale x 8 x float> poison, poison
+ %NXV16F32 = frem <vscale x 16 x float> poison, poison
+
+ %V1F64 = frem <1 x double> poison, poison
+ %V2F64 = frem <2 x double> poison, poison
+ %V4F64 = frem <4 x double> poison, poison
+ %V8F64 = frem <8 x double> poison, poison
+
+ %NXV1F64 = frem <vscale x 1 x double> poison, poison
+ %NXV2F64 = frem <vscale x 2 x double> poison, poison
+ %NXV4F64 = frem <vscale x 4 x double> poison, poison
+ %NXV8F64 = frem <vscale x 8 x double> poison, poison
+
+ %V1F32_VP = call <1 x float> @llvm.vp.frem(<1 x float> poison, <1 x float> poison, <1 x i1> poison, i32 poison)
+ %V2F32_VP = call <2 x float> @llvm.vp.frem(<2 x float> poison, <2 x float> poison, <2 x i1> poison, i32 poison)
+ %V4F32_VP = call <4 x float> @llvm.vp.frem(<4 x float> poison, <4 x float> poison, <4 x i1> poison, i32 poison)
+ %V8F32_VP = call <8 x float> @llvm.vp.frem(<8 x float> poison, <8 x float> poison, <8 x i1> poison, i32 poison)
+ %V16F32_VP = call <16 x float> @llvm.vp.frem(<16 x float> poison, <16 x float> poison, <16 x i1> poison, i32 poison)
+
+ %V1F64_VP = call <1 x double> @llvm.vp.frem(<1 x double> poison, <1 x double> poison, <1 x i1> poison, i32 poison)
+ %V2F64_VP = call <2 x double> @llvm.vp.frem(<2 x double> poison, <2 x double> poison, <2 x i1> poison, i32 poison)
+ %V4F64_VP = call <4 x double> @llvm.vp.frem(<4 x double> poison, <4 x double> poison, <4 x i1> poison, i32 poison)
+ %V8F64_VP = call <8 x double> @llvm.vp.frem(<8 x double> poison, <8 x double> poison, <8 x i1> poison, i32 poison)
+
+ %NXV1F32_VP = call <vscale x 1 x float> @llvm.vp.frem(<vscale x 1 x float> poison, <vscale x 1 x float> poison, <vscale x 1 x i1> poison, i32 poison)
+ %NXV2F32_VP = call <vscale x 2 x float> @llvm.vp.frem(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x i1> poison, i32 poison)
+ %NXV4F32_VP = call <vscale x 4 x float> @llvm.vp.frem(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x i1> poison, i32 poison)
+ %NXV8F32_VP = call <vscale x 8 x float> @llvm.vp.frem(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x i1> poison, i32 poison)
+ %NXV16F32_VP = call <vscale x 16 x float> @llvm.vp.frem(<vscale x 16 x float> poison, <vscale x 16 x float> poison, <vscale x 16 x i1> poison, i32 poison)
+
+ %NXV1F64_VP = call <vscale x 1 x double> @llvm.vp.frem(<vscale x 1 x double> poison, <vscale x 1 x double> poison, <vscale x 1 x i1> poison, i32 poison)
+ %NXV2F64_VP = call <vscale x 2 x double> @llvm.vp.frem(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x i1> poison, i32 poison)
+ %NXV4F64_VP = call <vscale x 4 x double> @llvm.vp.frem(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x i1> poison, i32 poison)
+ %NXV8F64_VP = call <vscale x 8 x double> @llvm.vp.frem(<vscale x 8 x double> poison, <vscale x 8 x double> poison, <vscale x 8 x i1> poison, i32 poison)
+
+ ret void
+}
+
+define void @frem_bf16() {
+; ZVFH-LABEL: 'frem_bf16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = frem bfloat poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1BF16 = frem <1 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2BF16 = frem <2 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4BF16 = frem <4 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8BF16 = frem <8 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16BF16 = frem <16 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %V32BF16 = frem <32 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16 = frem <vscale x 1 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16 = frem <vscale x 2 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16 = frem <vscale x 4 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16 = frem <vscale x 8 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16 = frem <vscale x 16 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV32BF16 = frem <vscale x 32 x bfloat> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1BF16_VP = call <1 x bfloat> @llvm.vp.frem.v1bf16(<1 x bfloat> poison, <1 x bfloat> poison, <1 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2BF16_VP = call <2 x bfloat> @llvm.vp.frem.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4BF16_VP = call <4 x bfloat> @llvm.vp.frem.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V8BF16_VP = call <8 x bfloat> @llvm.vp.frem.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison, <8 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V16BF16_VP = call <16 x bfloat> @llvm.vp.frem.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.frem.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.frem.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.frem.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.frem.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.frem.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'frem_bf16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = frem bfloat poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1BF16 = frem <1 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2BF16 = frem <2 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4BF16 = frem <4 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8BF16 = frem <8 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16BF16 = frem <16 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %V32BF16 = frem <32 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16 = frem <vscale x 1 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16 = frem <vscale x 2 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16 = frem <vscale x 4 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16 = frem <vscale x 8 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16 = frem <vscale x 16 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV32BF16 = frem <vscale x 32 x bfloat> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1BF16_VP = call <1 x bfloat> @llvm.vp.frem.v1bf16(<1 x bfloat> poison, <1 x bfloat> poison, <1 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2BF16_VP = call <2 x bfloat> @llvm.vp.frem.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4BF16_VP = call <4 x bfloat> @llvm.vp.frem.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V8BF16_VP = call <8 x bfloat> @llvm.vp.frem.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison, <8 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V16BF16_VP = call <16 x bfloat> @llvm.vp.frem.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.frem.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.frem.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.frem.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.frem.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.frem.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; NO-ZFHMIN-LABEL: 'frem_bf16'
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = frem bfloat poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16 = frem <1 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16 = frem <2 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16 = frem <4 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16 = frem <8 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16BF16 = frem <16 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32BF16 = frem <32 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16 = frem <vscale x 1 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16 = frem <vscale x 2 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16 = frem <vscale x 4 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16 = frem <vscale x 8 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16 = frem <vscale x 16 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV32BF16 = frem <vscale x 32 x bfloat> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16_VP = call <1 x bfloat> @llvm.vp.frem.v1bf16(<1 x bfloat> poison, <1 x bfloat> poison, <1 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16_VP = call <2 x bfloat> @llvm.vp.frem.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16_VP = call <4 x bfloat> @llvm.vp.frem.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16_VP = call <8 x bfloat> @llvm.vp.frem.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison, <8 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16BF16_VP = call <16 x bfloat> @llvm.vp.frem.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.frem.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.frem.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.frem.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.frem.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.frem.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %BF16 = frem bfloat poison, poison
+
+ %V1BF16 = frem <1 x bfloat> poison, poison
+ %V2BF16 = frem <2 x bfloat> poison, poison
+ %V4BF16 = frem <4 x bfloat> poison, poison
+ %V8BF16 = frem <8 x bfloat> poison, poison
+ %V16BF16 = frem <16 x bfloat> poison, poison
+ %V32BF16 = frem <32 x bfloat> poison, poison
+
+ %NXV1BF16 = frem <vscale x 1 x bfloat> poison, poison
+ %NXV2BF16 = frem <vscale x 2 x bfloat> poison, poison
+ %NXV4BF16 = frem <vscale x 4 x bfloat> poison, poison
+ %NXV8BF16 = frem <vscale x 8 x bfloat> poison, poison
+ %NXV16BF16 = frem <vscale x 16 x bfloat> poison, poison
+ %NXV32BF16 = frem <vscale x 32 x bfloat> poison, poison
+
+ %V1BF16_VP = call <1 x bfloat> @llvm.vp.frem(<1 x bfloat> poison, <1 x bfloat> poison, <1 x i1> poison, i32 poison)
+ %V2BF16_VP = call <2 x bfloat> @llvm.vp.frem(<2 x bfloat> poison, <2 x bfloat> poison, <2 x i1> poison, i32 poison)
+ %V4BF16_VP = call <4 x bfloat> @llvm.vp.frem(<4 x bfloat> poison, <4 x bfloat> poison, <4 x i1> poison, i32 poison)
+ %V8BF16_VP = call <8 x bfloat> @llvm.vp.frem(<8 x bfloat> poison, <8 x bfloat> poison, <8 x i1> poison, i32 poison)
+ %V16BF16_VP = call <16 x bfloat> @llvm.vp.frem(<16 x bfloat> poison, <16 x bfloat> poison, <16 x i1> poison, i32 poison)
+
+ %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.frem(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x i1> poison, i32 poison)
+ %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.frem(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+ %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.frem(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+ %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.frem(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+ %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.frem(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
ret void
}
define void @frem_f16() {
-; CHECK-LABEL: 'frem_f16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = frem half undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16 = frem <1 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F16 = frem <2 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F16 = frem <4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F16 = frem <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16F16 = frem <16 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %V32F16 = frem <32 x half> undef, undef
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16 = frem <vscale x 1 x half> undef, undef
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16 = frem <vscale x 2 x half> undef, undef
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16 = frem <vscale x 4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16 = frem <vscale x 8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16 = frem <vscale x 16 x half> undef, undef
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV32F16 = frem <vscale x 32 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; ZVFH-LABEL: 'frem_f16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = frem half poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16 = frem <1 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F16 = frem <2 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F16 = frem <4 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F16 = frem <8 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16F16 = frem <16 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %V32F16 = frem <32 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16 = frem <vscale x 1 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16 = frem <vscale x 2 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16 = frem <vscale x 4 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16 = frem <vscale x 8 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16 = frem <vscale x 16 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV32F16 = frem <vscale x 32 x half> poison, poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.frem.v1f16(<1 x half> poison, <1 x half> poison, <1 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.frem.v2f16(<2 x half> poison, <2 x half> poison, <2 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.frem.v4f16(<4 x half> poison, <4 x half> poison, <4 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.frem.v8f16(<8 x half> poison, <8 x half> poison, <8 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.frem.v16f16(<16 x half> poison, <16 x half> poison, <16 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.frem.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.frem.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.frem.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.frem.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.frem.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'frem_f16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = frem half poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16 = frem <1 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F16 = frem <2 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F16 = frem <4 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F16 = frem <8 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16F16 = frem <16 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %V32F16 = frem <32 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16 = frem <vscale x 1 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16 = frem <vscale x 2 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16 = frem <vscale x 4 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16 = frem <vscale x 8 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16 = frem <vscale x 16 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV32F16 = frem <vscale x 32 x half> poison, poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.frem.v1f16(<1 x half> poison, <1 x half> poison, <1 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.frem.v2f16(<2 x half> poison, <2 x half> poison, <2 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.frem.v4f16(<4 x half> poison, <4 x half> poison, <4 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.frem.v8f16(<8 x half> poison, <8 x half> poison, <8 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.frem.v16f16(<16 x half> poison, <16 x half> poison, <16 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.frem.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.frem.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.frem.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.frem.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.frem.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %F16 = frem half undef, undef
-
- %V1F16 = frem <1 x half> undef, undef
- %V2F16 = frem <2 x half> undef, undef
- %V4F16 = frem <4 x half> undef, undef
- %V8F16 = frem <8 x half> undef, undef
- %V16F16 = frem <16 x half> undef, undef
- %V32F16 = frem <32 x half> undef, undef
-
- %NXV1F16 = frem <vscale x 1 x half> undef, undef
- %NXV2F16 = frem <vscale x 2 x half> undef, undef
- %NXV4F16 = frem <vscale x 4 x half> undef, undef
- %NXV8F16 = frem <vscale x 8 x half> undef, undef
- %NXV16F16 = frem <vscale x 16 x half> undef, undef
- %NXV32F16 = frem <vscale x 32 x half> undef, undef
+; NO-ZFHMIN-LABEL: 'frem_f16'
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = frem half poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = frem <1 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F16 = frem <2 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F16 = frem <4 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F16 = frem <8 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F16 = frem <16 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32F16 = frem <32 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16 = frem <vscale x 1 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16 = frem <vscale x 2 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16 = frem <vscale x 4 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16 = frem <vscale x 8 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16 = frem <vscale x 16 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV32F16 = frem <vscale x 32 x half> poison, poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.frem.v1f16(<1 x half> poison, <1 x half> poison, <1 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.frem.v2f16(<2 x half> poison, <2 x half> poison, <2 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.frem.v4f16(<4 x half> poison, <4 x half> poison, <4 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.frem.v8f16(<8 x half> poison, <8 x half> poison, <8 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.frem.v16f16(<16 x half> poison, <16 x half> poison, <16 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.frem.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.frem.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.frem.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.frem.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.frem.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %F16 = frem half poison, poison
+
+ %V1F16 = frem <1 x half> poison, poison
+ %V2F16 = frem <2 x half> poison, poison
+ %V4F16 = frem <4 x half> poison, poison
+ %V8F16 = frem <8 x half> poison, poison
+ %V16F16 = frem <16 x half> poison, poison
+ %V32F16 = frem <32 x half> poison, poison
+
+ %NXV1F16 = frem <vscale x 1 x half> poison, poison
+ %NXV2F16 = frem <vscale x 2 x half> poison, poison
+ %NXV4F16 = frem <vscale x 4 x half> poison, poison
+ %NXV8F16 = frem <vscale x 8 x half> poison, poison
+ %NXV16F16 = frem <vscale x 16 x half> poison, poison
+ %NXV32F16 = frem <vscale x 32 x half> poison, poison
+
+ %V1F16_VP = call <1 x half> @llvm.vp.frem(<1 x half> poison, <1 x half> poison, <1 x i1> poison, i32 poison)
+ %V2F16_VP = call <2 x half> @llvm.vp.frem(<2 x half> poison, <2 x half> poison, <2 x i1> poison, i32 poison)
+ %V4F16_VP = call <4 x half> @llvm.vp.frem(<4 x half> poison, <4 x half> poison, <4 x i1> poison, i32 poison)
+ %V8F16_VP = call <8 x half> @llvm.vp.frem(<8 x half> poison, <8 x half> poison, <8 x i1> poison, i32 poison)
+ %V16F16_VP = call <16 x half> @llvm.vp.frem(<16 x half> poison, <16 x half> poison, <16 x i1> poison, i32 poison)
+
+ %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.frem(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x i1> poison, i32 poison)
+ %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.frem(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+ %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.frem(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+ %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.frem(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+ %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.frem(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
ret void
}
define void @fneg() {
; CHECK-LABEL: 'fneg'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fneg half undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fneg float undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fneg double undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16 = fneg <1 x bfloat> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2BF16 = fneg <2 x bfloat> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4BF16 = fneg <4 x bfloat> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8BF16 = fneg <8 x bfloat> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16BF16 = fneg <16 x bfloat> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1BF16 = fneg <vscale x 1 x bfloat> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2BF16 = fneg <vscale x 2 x bfloat> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4BF16 = fneg <vscale x 4 x bfloat> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8BF16 = fneg <vscale x 8 x bfloat> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16BF16 = fneg <vscale x 16 x bfloat> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fneg <1 x float> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fneg <2 x float> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fneg <4 x float> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fneg <8 x float> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fneg <16 x float> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fneg <vscale x 1 x float> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fneg <vscale x 2 x float> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fneg <vscale x 4 x float> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fneg <vscale x 8 x float> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fneg <vscale x 16 x float> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fneg <1 x double> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fneg <2 x double> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fneg <4 x double> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fneg <8 x double> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fneg <vscale x 1 x double> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64 = fneg <vscale x 2 x double> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fneg <vscale x 4 x double> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fneg <vscale x 8 x double> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16_VP = call <1 x bfloat> @llvm.vp.fneg.v1bf16(<1 x bfloat> undef, <1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2BF16_VP = call <2 x bfloat> @llvm.vp.fneg.v2bf16(<2 x bfloat> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4BF16_VP = call <4 x bfloat> @llvm.vp.fneg.v4bf16(<4 x bfloat> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8BF16_VP = call <8 x bfloat> @llvm.vp.fneg.v8bf16(<8 x bfloat> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16BF16_VP = call <16 x bfloat> @llvm.vp.fneg.v16bf16(<16 x bfloat> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32_VP = call <1 x float> @llvm.vp.fneg.v1f32(<1 x float> undef, <1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32_VP = call <2 x float> @llvm.vp.fneg.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32_VP = call <4 x float> @llvm.vp.fneg.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32_VP = call <8 x float> @llvm.vp.fneg.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32_VP = call <16 x float> @llvm.vp.fneg.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64_VP = call <1 x double> @llvm.vp.fneg.v1f64(<1 x double> undef, <1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64_VP = call <2 x double> @llvm.vp.fneg.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64_VP = call <4 x double> @llvm.vp.fneg.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64_VP = call <8 x double> @llvm.vp.fneg.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16F64_VP = call <16 x double> @llvm.vp.fneg.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.fneg.nxv1bf16(<vscale x 1 x bfloat> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.fneg.nxv2bf16(<vscale x 2 x bfloat> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.fneg.nxv4bf16(<vscale x 4 x bfloat> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.fneg.nxv8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.fneg.nxv16bf16(<vscale x 16 x bfloat> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1BF32_VP = call <vscale x 1 x float> @llvm.vp.fneg.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2BF32_VP = call <vscale x 2 x float> @llvm.vp.fneg.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4BF32_VP = call <vscale x 4 x float> @llvm.vp.fneg.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8BF32_VP = call <vscale x 8 x float> @llvm.vp.fneg.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16BF32_VP = call <vscale x 16 x float> @llvm.vp.fneg.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1BF64_VP = call <vscale x 1 x double> @llvm.vp.fneg.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV2BF64_VP = call <vscale x 2 x double> @llvm.vp.fneg.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4BF64_VP = call <vscale x 4 x double> @llvm.vp.fneg.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8BF64_VP = call <vscale x 8 x double> @llvm.vp.fneg.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %NXV16BF64_VP = call <vscale x 16 x double> @llvm.vp.fneg.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fneg float poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fneg double poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = fneg <1 x float> poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fneg <2 x float> poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fneg <4 x float> poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fneg <8 x float> poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = fneg <16 x float> poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fneg <vscale x 1 x float> poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fneg <vscale x 2 x float> poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32 = fneg <vscale x 4 x float> poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32 = fneg <vscale x 8 x float> poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32 = fneg <vscale x 16 x float> poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = fneg <1 x double> poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fneg <2 x double> poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fneg <4 x double> poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fneg <8 x double> poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fneg <vscale x 1 x double> poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64 = fneg <vscale x 2 x double> poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64 = fneg <vscale x 4 x double> poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64 = fneg <vscale x 8 x double> poison
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32_VP = call <1 x float> @llvm.vp.fneg.v1f32(<1 x float> poison, <1 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32_VP = call <2 x float> @llvm.vp.fneg.v2f32(<2 x float> poison, <2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32_VP = call <4 x float> @llvm.vp.fneg.v4f32(<4 x float> poison, <4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32_VP = call <8 x float> @llvm.vp.fneg.v8f32(<8 x float> poison, <8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32_VP = call <16 x float> @llvm.vp.fneg.v16f32(<16 x float> poison, <16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64_VP = call <1 x double> @llvm.vp.fneg.v1f64(<1 x double> poison, <1 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64_VP = call <2 x double> @llvm.vp.fneg.v2f64(<2 x double> poison, <2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64_VP = call <4 x double> @llvm.vp.fneg.v4f64(<4 x double> poison, <4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64_VP = call <8 x double> @llvm.vp.fneg.v8f64(<8 x double> poison, <8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16F64_VP = call <16 x double> @llvm.vp.fneg.v16f64(<16 x double> poison, <16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32_VP = call <vscale x 1 x float> @llvm.vp.fneg.nxv1f32(<vscale x 1 x float> poison, <vscale x 1 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32_VP = call <vscale x 2 x float> @llvm.vp.fneg.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F32_VP = call <vscale x 4 x float> @llvm.vp.fneg.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F32_VP = call <vscale x 8 x float> @llvm.vp.fneg.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV16F32_VP = call <vscale x 16 x float> @llvm.vp.fneg.nxv16f32(<vscale x 16 x float> poison, <vscale x 16 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64_VP = call <vscale x 1 x double> @llvm.vp.fneg.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV2F64_VP = call <vscale x 2 x double> @llvm.vp.fneg.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV4F64_VP = call <vscale x 4 x double> @llvm.vp.fneg.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV8F64_VP = call <vscale x 8 x double> @llvm.vp.fneg.nxv8f64(<vscale x 8 x double> poison, <vscale x 8 x i1> poison, i32 poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %NXV16F64_VP = call <vscale x 16 x double> @llvm.vp.fneg.nxv16f64(<vscale x 16 x double> poison, <vscale x 16 x i1> poison, i32 poison)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %BF16 = fneg half undef
- %F32 = fneg float undef
- %F64 = fneg double undef
-
- %V1BF16 = fneg <1 x bfloat> undef
- %V2BF16 = fneg <2 x bfloat> undef
- %V4BF16 = fneg <4 x bfloat> undef
- %V8BF16 = fneg <8 x bfloat> undef
- %V16BF16 = fneg <16 x bfloat> undef
-
- %NXV1BF16 = fneg <vscale x 1 x bfloat> undef
- %NXV2BF16 = fneg <vscale x 2 x bfloat> undef
- %NXV4BF16 = fneg <vscale x 4 x bfloat> undef
- %NXV8BF16 = fneg <vscale x 8 x bfloat> undef
- %NXV16BF16 = fneg <vscale x 16 x bfloat> undef
-
- %V1F32 = fneg <1 x float> undef
- %V2F32 = fneg <2 x float> undef
- %V4F32 = fneg <4 x float> undef
- %V8F32 = fneg <8 x float> undef
- %V16F32 = fneg <16 x float> undef
-
- %NXV1F32 = fneg <vscale x 1 x float> undef
- %NXV2F32 = fneg <vscale x 2 x float> undef
- %NXV4F32 = fneg <vscale x 4 x float> undef
- %NXV8F32 = fneg <vscale x 8 x float> undef
- %NXV16F32 = fneg <vscale x 16 x float> undef
-
- %V1F64 = fneg <1 x double> undef
- %V2F64 = fneg <2 x double> undef
- %V4F64 = fneg <4 x double> undef
- %V8F64 = fneg <8 x double> undef
-
- %NXV1F64 = fneg <vscale x 1 x double> undef
- %NXV2F64 = fneg <vscale x 2 x double> undef
- %NXV4F64 = fneg <vscale x 4 x double> undef
- %NXV8F64 = fneg <vscale x 8 x double> undef
-
- %V1BF16_VP = call <1 x bfloat> @llvm.vp.fneg.v1f16(<1 x bfloat> undef, <1 x i1> undef, i32 undef)
- %V2BF16_VP = call <2 x bfloat> @llvm.vp.fneg.v2f16(<2 x bfloat> undef, <2 x i1> undef, i32 undef)
- %V4BF16_VP = call <4 x bfloat> @llvm.vp.fneg.v4f16(<4 x bfloat> undef, <4 x i1> undef, i32 undef)
- %V8BF16_VP = call <8 x bfloat> @llvm.vp.fneg.v8f16(<8 x bfloat> undef, <8 x i1> undef, i32 undef)
- %V16BF16_VP = call <16 x bfloat> @llvm.vp.fneg.v16f16(<16 x bfloat> undef, <16 x i1> undef, i32 undef)
-
- %V1F32_VP = call <1 x float> @llvm.vp.fneg.v1f32(<1 x float> undef, <1 x i1> undef, i32 undef)
- %V2F32_VP = call <2 x float> @llvm.vp.fneg.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
- %V4F32_VP = call <4 x float> @llvm.vp.fneg.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
- %V8F32_VP = call <8 x float> @llvm.vp.fneg.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
- %V16F32_VP = call <16 x float> @llvm.vp.fneg.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-
- %V1F64_VP = call <1 x double> @llvm.vp.fneg.v1f64(<1 x double> undef, <1 x i1> undef, i32 undef)
- %V2F64_VP = call <2 x double> @llvm.vp.fneg.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
- %V4F64_VP = call <4 x double> @llvm.vp.fneg.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
- %V8F64_VP = call <8 x double> @llvm.vp.fneg.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
- %V16F64_VP = call <16 x double> @llvm.vp.fneg.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-
- %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.fneg.nxv1f16(<vscale x 1 x bfloat> undef, <vscale x 1 x i1> undef, i32 undef)
- %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.fneg.nxv2f16(<vscale x 2 x bfloat> undef, <vscale x 2 x i1> undef, i32 undef)
- %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.fneg.nxv4f16(<vscale x 4 x bfloat> undef, <vscale x 4 x i1> undef, i32 undef)
- %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.fneg.nxv8f16(<vscale x 8 x bfloat> undef, <vscale x 8 x i1> undef, i32 undef)
- %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.fneg.nxv16f16(<vscale x 16 x bfloat> undef, <vscale x 16 x i1> undef, i32 undef)
-
- %NXV1BF32_VP = call <vscale x 1 x float> @llvm.vp.fneg.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
- %NXV2BF32_VP = call <vscale x 2 x float> @llvm.vp.fneg.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
- %NXV4BF32_VP = call <vscale x 4 x float> @llvm.vp.fneg.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
- %NXV8BF32_VP = call <vscale x 8 x float> @llvm.vp.fneg.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
- %NXV16BF32_VP = call <vscale x 16 x float> @llvm.vp.fneg.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-
- %NXV1BF64_VP = call <vscale x 1 x double> @llvm.vp.fneg.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
- %NXV2BF64_VP = call <vscale x 2 x double> @llvm.vp.fneg.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
- %NXV4BF64_VP = call <vscale x 4 x double> @llvm.vp.fneg.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
- %NXV8BF64_VP = call <vscale x 8 x double> @llvm.vp.fneg.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
- %NXV16BF64_VP = call <vscale x 16 x double> @llvm.vp.fneg.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+ %F32 = fneg float poison
+ %F64 = fneg double poison
+
+ %V1F32 = fneg <1 x float> poison
+ %V2F32 = fneg <2 x float> poison
+ %V4F32 = fneg <4 x float> poison
+ %V8F32 = fneg <8 x float> poison
+ %V16F32 = fneg <16 x float> poison
+
+ %NXV1F32 = fneg <vscale x 1 x float> poison
+ %NXV2F32 = fneg <vscale x 2 x float> poison
+ %NXV4F32 = fneg <vscale x 4 x float> poison
+ %NXV8F32 = fneg <vscale x 8 x float> poison
+ %NXV16F32 = fneg <vscale x 16 x float> poison
+
+ %V1F64 = fneg <1 x double> poison
+ %V2F64 = fneg <2 x double> poison
+ %V4F64 = fneg <4 x double> poison
+ %V8F64 = fneg <8 x double> poison
+
+ %NXV1F64 = fneg <vscale x 1 x double> poison
+ %NXV2F64 = fneg <vscale x 2 x double> poison
+ %NXV4F64 = fneg <vscale x 4 x double> poison
+ %NXV8F64 = fneg <vscale x 8 x double> poison
+
+ %V1F32_VP = call <1 x float> @llvm.vp.fneg(<1 x float> poison, <1 x i1> poison, i32 poison)
+ %V2F32_VP = call <2 x float> @llvm.vp.fneg(<2 x float> poison, <2 x i1> poison, i32 poison)
+ %V4F32_VP = call <4 x float> @llvm.vp.fneg(<4 x float> poison, <4 x i1> poison, i32 poison)
+ %V8F32_VP = call <8 x float> @llvm.vp.fneg(<8 x float> poison, <8 x i1> poison, i32 poison)
+ %V16F32_VP = call <16 x float> @llvm.vp.fneg(<16 x float> poison, <16 x i1> poison, i32 poison)
+
+ %V1F64_VP = call <1 x double> @llvm.vp.fneg(<1 x double> poison, <1 x i1> poison, i32 poison)
+ %V2F64_VP = call <2 x double> @llvm.vp.fneg(<2 x double> poison, <2 x i1> poison, i32 poison)
+ %V4F64_VP = call <4 x double> @llvm.vp.fneg(<4 x double> poison, <4 x i1> poison, i32 poison)
+ %V8F64_VP = call <8 x double> @llvm.vp.fneg(<8 x double> poison, <8 x i1> poison, i32 poison)
+ %V16F64_VP = call <16 x double> @llvm.vp.fneg(<16 x double> poison, <16 x i1> poison, i32 poison)
+
+ %NXV1F32_VP = call <vscale x 1 x float> @llvm.vp.fneg(<vscale x 1 x float> poison, <vscale x 1 x i1> poison, i32 poison)
+ %NXV2F32_VP = call <vscale x 2 x float> @llvm.vp.fneg(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, i32 poison)
+ %NXV4F32_VP = call <vscale x 4 x float> @llvm.vp.fneg(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, i32 poison)
+ %NXV8F32_VP = call <vscale x 8 x float> @llvm.vp.fneg(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, i32 poison)
+ %NXV16F32_VP = call <vscale x 16 x float> @llvm.vp.fneg(<vscale x 16 x float> poison, <vscale x 16 x i1> poison, i32 poison)
+
+ %NXV1F64_VP = call <vscale x 1 x double> @llvm.vp.fneg(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, i32 poison)
+ %NXV2F64_VP = call <vscale x 2 x double> @llvm.vp.fneg(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, i32 poison)
+ %NXV4F64_VP = call <vscale x 4 x double> @llvm.vp.fneg(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, i32 poison)
+ %NXV8F64_VP = call <vscale x 8 x double> @llvm.vp.fneg(<vscale x 8 x double> poison, <vscale x 8 x i1> poison, i32 poison)
+ %NXV16F64_VP = call <vscale x 16 x double> @llvm.vp.fneg(<vscale x 16 x double> poison, <vscale x 16 x i1> poison, i32 poison)
+
+ ret void
+}
+
+define void @fneg_bf16() {
+; ZVFH-LABEL: 'fneg_bf16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fneg bfloat poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16 = fneg <1 x bfloat> poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2BF16 = fneg <2 x bfloat> poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4BF16 = fneg <4 x bfloat> poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8BF16 = fneg <8 x bfloat> poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16BF16 = fneg <16 x bfloat> poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32BF16 = fneg <32 x bfloat> poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1BF16 = fneg <vscale x 1 x bfloat> poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2BF16 = fneg <vscale x 2 x bfloat> poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4BF16 = fneg <vscale x 4 x bfloat> poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8BF16 = fneg <vscale x 8 x bfloat> poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16BF16 = fneg <vscale x 16 x bfloat> poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32BF16 = fneg <vscale x 32 x bfloat> poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16_VP = call <1 x bfloat> @llvm.vp.fneg.v1bf16(<1 x bfloat> poison, <1 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2BF16_VP = call <2 x bfloat> @llvm.vp.fneg.v2bf16(<2 x bfloat> poison, <2 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4BF16_VP = call <4 x bfloat> @llvm.vp.fneg.v4bf16(<4 x bfloat> poison, <4 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8BF16_VP = call <8 x bfloat> @llvm.vp.fneg.v8bf16(<8 x bfloat> poison, <8 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16BF16_VP = call <16 x bfloat> @llvm.vp.fneg.v16bf16(<16 x bfloat> poison, <16 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.fneg.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.fneg.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.fneg.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.fneg.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.fneg.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'fneg_bf16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fneg bfloat poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16 = fneg <1 x bfloat> poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2BF16 = fneg <2 x bfloat> poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4BF16 = fneg <4 x bfloat> poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8BF16 = fneg <8 x bfloat> poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16BF16 = fneg <16 x bfloat> poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32BF16 = fneg <32 x bfloat> poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1BF16 = fneg <vscale x 1 x bfloat> poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2BF16 = fneg <vscale x 2 x bfloat> poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4BF16 = fneg <vscale x 4 x bfloat> poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8BF16 = fneg <vscale x 8 x bfloat> poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16BF16 = fneg <vscale x 16 x bfloat> poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32BF16 = fneg <vscale x 32 x bfloat> poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16_VP = call <1 x bfloat> @llvm.vp.fneg.v1bf16(<1 x bfloat> poison, <1 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2BF16_VP = call <2 x bfloat> @llvm.vp.fneg.v2bf16(<2 x bfloat> poison, <2 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4BF16_VP = call <4 x bfloat> @llvm.vp.fneg.v4bf16(<4 x bfloat> poison, <4 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8BF16_VP = call <8 x bfloat> @llvm.vp.fneg.v8bf16(<8 x bfloat> poison, <8 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16BF16_VP = call <16 x bfloat> @llvm.vp.fneg.v16bf16(<16 x bfloat> poison, <16 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.fneg.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.fneg.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.fneg.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.fneg.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.fneg.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; NO-ZFHMIN-LABEL: 'fneg_bf16'
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fneg bfloat poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16 = fneg <1 x bfloat> poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16 = fneg <2 x bfloat> poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16 = fneg <4 x bfloat> poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16 = fneg <8 x bfloat> poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16BF16 = fneg <16 x bfloat> poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32BF16 = fneg <32 x bfloat> poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16 = fneg <vscale x 1 x bfloat> poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16 = fneg <vscale x 2 x bfloat> poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16 = fneg <vscale x 4 x bfloat> poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16 = fneg <vscale x 8 x bfloat> poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16 = fneg <vscale x 16 x bfloat> poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV32BF16 = fneg <vscale x 32 x bfloat> poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16_VP = call <1 x bfloat> @llvm.vp.fneg.v1bf16(<1 x bfloat> poison, <1 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16_VP = call <2 x bfloat> @llvm.vp.fneg.v2bf16(<2 x bfloat> poison, <2 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16_VP = call <4 x bfloat> @llvm.vp.fneg.v4bf16(<4 x bfloat> poison, <4 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16_VP = call <8 x bfloat> @llvm.vp.fneg.v8bf16(<8 x bfloat> poison, <8 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16BF16_VP = call <16 x bfloat> @llvm.vp.fneg.v16bf16(<16 x bfloat> poison, <16 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.fneg.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.fneg.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.fneg.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.fneg.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.fneg.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %BF16 = fneg bfloat poison
+
+ %V1BF16 = fneg <1 x bfloat> poison
+ %V2BF16 = fneg <2 x bfloat> poison
+ %V4BF16 = fneg <4 x bfloat> poison
+ %V8BF16 = fneg <8 x bfloat> poison
+ %V16BF16 = fneg <16 x bfloat> poison
+ %V32BF16 = fneg <32 x bfloat> poison
+
+ %NXV1BF16 = fneg <vscale x 1 x bfloat> poison
+ %NXV2BF16 = fneg <vscale x 2 x bfloat> poison
+ %NXV4BF16 = fneg <vscale x 4 x bfloat> poison
+ %NXV8BF16 = fneg <vscale x 8 x bfloat> poison
+ %NXV16BF16 = fneg <vscale x 16 x bfloat> poison
+ %NXV32BF16 = fneg <vscale x 32 x bfloat> poison
+
+ %V1BF16_VP = call <1 x bfloat> @llvm.vp.fneg(<1 x bfloat> poison, <1 x i1> poison, i32 poison)
+ %V2BF16_VP = call <2 x bfloat> @llvm.vp.fneg(<2 x bfloat> poison, <2 x i1> poison, i32 poison)
+ %V4BF16_VP = call <4 x bfloat> @llvm.vp.fneg(<4 x bfloat> poison, <4 x i1> poison, i32 poison)
+ %V8BF16_VP = call <8 x bfloat> @llvm.vp.fneg(<8 x bfloat> poison, <8 x i1> poison, i32 poison)
+ %V16BF16_VP = call <16 x bfloat> @llvm.vp.fneg(<16 x bfloat> poison, <16 x i1> poison, i32 poison)
+
+ %NXV1BF16_VP = call <vscale x 1 x bfloat> @llvm.vp.fneg(<vscale x 1 x bfloat> poison, <vscale x 1 x i1> poison, i32 poison)
+ %NXV2BF16_VP = call <vscale x 2 x bfloat> @llvm.vp.fneg(<vscale x 2 x bfloat> poison, <vscale x 2 x i1> poison, i32 poison)
+ %NXV4BF16_VP = call <vscale x 4 x bfloat> @llvm.vp.fneg(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, i32 poison)
+ %NXV8BF16_VP = call <vscale x 8 x bfloat> @llvm.vp.fneg(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, i32 poison)
+ %NXV16BF16_VP = call <vscale x 16 x bfloat> @llvm.vp.fneg(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, i32 poison)
ret void
}
define void @fneg_f16() {
-; CHECK-LABEL: 'fneg_f16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fneg half undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fneg <1 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fneg <2 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fneg <4 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fneg <8 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fneg <16 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fneg <32 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fneg <vscale x 1 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fneg <vscale x 2 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fneg <vscale x 4 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fneg <vscale x 8 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fneg <vscale x 16 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fneg <vscale x 32 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; ZVFH-LABEL: 'fneg_f16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fneg half poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fneg <1 x half> poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fneg <2 x half> poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fneg <4 x half> poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fneg <8 x half> poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fneg <16 x half> poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fneg <32 x half> poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fneg <vscale x 1 x half> poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fneg <vscale x 2 x half> poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fneg <vscale x 4 x half> poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fneg <vscale x 8 x half> poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fneg <vscale x 16 x half> poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fneg <vscale x 32 x half> poison
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.fneg.v1f16(<1 x half> poison, <1 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.fneg.v2f16(<2 x half> poison, <2 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.fneg.v4f16(<4 x half> poison, <4 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.fneg.v8f16(<8 x half> poison, <8 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.fneg.v16f16(<16 x half> poison, <16 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fneg.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fneg.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fneg.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fneg.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fneg.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'fneg_f16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fneg half poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fneg <1 x half> poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = fneg <2 x half> poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fneg <4 x half> poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fneg <8 x half> poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fneg <16 x half> poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = fneg <32 x half> poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fneg <vscale x 1 x half> poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fneg <vscale x 2 x half> poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fneg <vscale x 4 x half> poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16 = fneg <vscale x 8 x half> poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16 = fneg <vscale x 16 x half> poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NXV32F16 = fneg <vscale x 32 x half> poison
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.fneg.v1f16(<1 x half> poison, <1 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.fneg.v2f16(<2 x half> poison, <2 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.fneg.v4f16(<4 x half> poison, <4 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.fneg.v8f16(<8 x half> poison, <8 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.fneg.v16f16(<16 x half> poison, <16 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fneg.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fneg.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fneg.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fneg.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fneg.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %F16 = fneg half undef
-
- %V1F16 = fneg <1 x half> undef
- %V2F16 = fneg <2 x half> undef
- %V4F16 = fneg <4 x half> undef
- %V8F16 = fneg <8 x half> undef
- %V16F16 = fneg <16 x half> undef
- %V32F16 = fneg <32 x half> undef
-
- %NXV1F16 = fneg <vscale x 1 x half> undef
- %NXV2F16 = fneg <vscale x 2 x half> undef
- %NXV4F16 = fneg <vscale x 4 x half> undef
- %NXV8F16 = fneg <vscale x 8 x half> undef
- %NXV16F16 = fneg <vscale x 16 x half> undef
- %NXV32F16 = fneg <vscale x 32 x half> undef
+; NO-ZFHMIN-LABEL: 'fneg_f16'
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fneg half poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = fneg <1 x half> poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F16 = fneg <2 x half> poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F16 = fneg <4 x half> poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F16 = fneg <8 x half> poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F16 = fneg <16 x half> poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32F16 = fneg <32 x half> poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16 = fneg <vscale x 1 x half> poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16 = fneg <vscale x 2 x half> poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16 = fneg <vscale x 4 x half> poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16 = fneg <vscale x 8 x half> poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16 = fneg <vscale x 16 x half> poison
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV32F16 = fneg <vscale x 32 x half> poison
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16_VP = call <1 x half> @llvm.vp.fneg.v1f16(<1 x half> poison, <1 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F16_VP = call <2 x half> @llvm.vp.fneg.v2f16(<2 x half> poison, <2 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F16_VP = call <4 x half> @llvm.vp.fneg.v4f16(<4 x half> poison, <4 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F16_VP = call <8 x half> @llvm.vp.fneg.v8f16(<8 x half> poison, <8 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F16_VP = call <16 x half> @llvm.vp.fneg.v16f16(<16 x half> poison, <16 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fneg.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fneg.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fneg.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fneg.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fneg.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %F16 = fneg half poison
+
+ %V1F16 = fneg <1 x half> poison
+ %V2F16 = fneg <2 x half> poison
+ %V4F16 = fneg <4 x half> poison
+ %V8F16 = fneg <8 x half> poison
+ %V16F16 = fneg <16 x half> poison
+ %V32F16 = fneg <32 x half> poison
+
+ %NXV1F16 = fneg <vscale x 1 x half> poison
+ %NXV2F16 = fneg <vscale x 2 x half> poison
+ %NXV4F16 = fneg <vscale x 4 x half> poison
+ %NXV8F16 = fneg <vscale x 8 x half> poison
+ %NXV16F16 = fneg <vscale x 16 x half> poison
+ %NXV32F16 = fneg <vscale x 32 x half> poison
+
+ %V1F16_VP = call <1 x half> @llvm.vp.fneg(<1 x half> poison, <1 x i1> poison, i32 poison)
+ %V2F16_VP = call <2 x half> @llvm.vp.fneg(<2 x half> poison, <2 x i1> poison, i32 poison)
+ %V4F16_VP = call <4 x half> @llvm.vp.fneg(<4 x half> poison, <4 x i1> poison, i32 poison)
+ %V8F16_VP = call <8 x half> @llvm.vp.fneg(<8 x half> poison, <8 x i1> poison, i32 poison)
+ %V16F16_VP = call <16 x half> @llvm.vp.fneg(<16 x half> poison, <16 x i1> poison, i32 poison)
+
+ %NXV1F16_VP = call <vscale x 1 x half> @llvm.vp.fneg(<vscale x 1 x half> poison, <vscale x 1 x i1> poison, i32 poison)
+ %NXV2F16_VP = call <vscale x 2 x half> @llvm.vp.fneg(<vscale x 2 x half> poison, <vscale x 2 x i1> poison, i32 poison)
+ %NXV4F16_VP = call <vscale x 4 x half> @llvm.vp.fneg(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, i32 poison)
+ %NXV8F16_VP = call <vscale x 8 x half> @llvm.vp.fneg(<vscale x 8 x half> poison, <vscale x 8 x i1> poison, i32 poison)
+ %NXV16F16_VP = call <vscale x 16 x half> @llvm.vp.fneg(<vscale x 16 x half> poison, <vscale x 16 x i1> poison, i32 poison)
ret void
}
define void @fcopysign() {
; CHECK-LABEL: 'fcopysign'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %BF16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.copysign.f32(float undef, float undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.copysign.f64(double undef, double undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1BF16 = call <1 x bfloat> @llvm.copysign.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16BF16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16 = call <vscale x 1 x bfloat> @llvm.copysign.nxv1bf16(<vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16 = call <vscale x 2 x bfloat> @llvm.copysign.nxv2bf16(<vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.copysign.nxv4bf16(<vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.copysign.nxv8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.copysign.nxv16bf16(<vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.copysign.v1f32(<1 x float> undef, <1 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.copysign.v2f32(<2 x float> undef, <2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = call <vscale x 1 x float> @llvm.copysign.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = call <vscale x 2 x float> @llvm.copysign.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F32 = call <vscale x 4 x float> @llvm.copysign.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV8F32 = call <vscale x 8 x float> @llvm.copysign.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV16F32 = call <vscale x 16 x float> @llvm.copysign.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.copysign.v1f64(<1 x double> undef, <1 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = call <vscale x 1 x double> @llvm.copysign.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F64 = call <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F64 = call <vscale x 4 x double> @llvm.copysign.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV8F64 = call <vscale x 8 x double> @llvm.copysign.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.copysign.f32(float poison, float poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.copysign.f64(double poison, double poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.copysign.v1f32(<1 x float> poison, <1 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.copysign.v2f32(<2 x float> poison, <2 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> poison, <4 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> poison, <8 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> poison, <16 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = call <vscale x 1 x float> @llvm.copysign.nxv1f32(<vscale x 1 x float> poison, <vscale x 1 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = call <vscale x 2 x float> @llvm.copysign.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F32 = call <vscale x 4 x float> @llvm.copysign.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV8F32 = call <vscale x 8 x float> @llvm.copysign.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV16F32 = call <vscale x 16 x float> @llvm.copysign.nxv16f32(<vscale x 16 x float> poison, <vscale x 16 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.copysign.v1f64(<1 x double> poison, <1 x double> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> poison, <2 x double> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> poison, <4 x double> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> poison, <8 x double> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = call <vscale x 1 x double> @llvm.copysign.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x double> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F64 = call <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x double> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F64 = call <vscale x 4 x double> @llvm.copysign.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x double> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV8F64 = call <vscale x 8 x double> @llvm.copysign.nxv8f64(<vscale x 8 x double> poison, <vscale x 8 x double> poison)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %BF16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef)
- %F32 = call float @llvm.copysign.f32(float undef, float undef)
- %F64 = call double @llvm.copysign.f64(double undef, double undef)
-
- %V1BF16 = call <1 x bfloat> @llvm.copysign.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef)
- %V2BF16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
- %V4BF16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
- %V8BF16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
- %V16BF16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
-
- %NXV1BF16 = call <vscale x 1 x bfloat> @llvm.copysign.nxv1bf16(<vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef)
- %NXV2BF16 = call <vscale x 2 x bfloat> @llvm.copysign.nxv2bf16(<vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef)
- %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.copysign.nxv4bf16(<vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef)
- %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.copysign.nxv8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef)
- %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.copysign.nxv16bf16(<vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef)
-
- %V1F32 = call <1 x float> @llvm.copysign.v1f32(<1 x float> undef, <1 x float> undef)
- %V2F32 = call <2 x float> @llvm.copysign.v2f32(<2 x float> undef, <2 x float> undef)
- %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
- %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
- %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
-
- %NXV1F32 = call <vscale x 1 x float> @llvm.copysign.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef)
- %NXV2F32 = call <vscale x 2 x float> @llvm.copysign.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef)
- %NXV4F32 = call <vscale x 4 x float> @llvm.copysign.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef)
- %NXV8F32 = call <vscale x 8 x float> @llvm.copysign.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef)
- %NXV16F32 = call <vscale x 16 x float> @llvm.copysign.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef)
-
- %V1F64 = call <1 x double> @llvm.copysign.v1f64(<1 x double> undef, <1 x double> undef)
- %V2F64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
- %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
- %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
-
- %NXV1F64 = call <vscale x 1 x double> @llvm.copysign.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef)
- %NXV2F64 = call <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef)
- %NXV4F64 = call <vscale x 4 x double> @llvm.copysign.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef)
- %NXV8F64 = call <vscale x 8 x double> @llvm.copysign.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef)
+ %F32 = call float @llvm.copysign.f32(float poison, float poison)
+ %F64 = call double @llvm.copysign.f64(double poison, double poison)
+
+ %V1F32 = call <1 x float> @llvm.copysign(<1 x float> poison, <1 x float> poison)
+ %V2F32 = call <2 x float> @llvm.copysign(<2 x float> poison, <2 x float> poison)
+ %V4F32 = call <4 x float> @llvm.copysign(<4 x float> poison, <4 x float> poison)
+ %V8F32 = call <8 x float> @llvm.copysign(<8 x float> poison, <8 x float> poison)
+ %V16F32 = call <16 x float> @llvm.copysign(<16 x float> poison, <16 x float> poison)
+
+ %NXV1F32 = call <vscale x 1 x float> @llvm.copysign(<vscale x 1 x float> poison, <vscale x 1 x float> poison)
+ %NXV2F32 = call <vscale x 2 x float> @llvm.copysign(<vscale x 2 x float> poison, <vscale x 2 x float> poison)
+ %NXV4F32 = call <vscale x 4 x float> @llvm.copysign(<vscale x 4 x float> poison, <vscale x 4 x float> poison)
+ %NXV8F32 = call <vscale x 8 x float> @llvm.copysign(<vscale x 8 x float> poison, <vscale x 8 x float> poison)
+ %NXV16F32 = call <vscale x 16 x float> @llvm.copysign(<vscale x 16 x float> poison, <vscale x 16 x float> poison)
+
+ %V1F64 = call <1 x double> @llvm.copysign(<1 x double> poison, <1 x double> poison)
+ %V2F64 = call <2 x double> @llvm.copysign(<2 x double> poison, <2 x double> poison)
+ %V4F64 = call <4 x double> @llvm.copysign(<4 x double> poison, <4 x double> poison)
+ %V8F64 = call <8 x double> @llvm.copysign(<8 x double> poison, <8 x double> poison)
+
+ %NXV1F64 = call <vscale x 1 x double> @llvm.copysign(<vscale x 1 x double> poison, <vscale x 1 x double> poison)
+ %NXV2F64 = call <vscale x 2 x double> @llvm.copysign(<vscale x 2 x double> poison, <vscale x 2 x double> poison)
+ %NXV4F64 = call <vscale x 4 x double> @llvm.copysign(<vscale x 4 x double> poison, <vscale x 4 x double> poison)
+ %NXV8F64 = call <vscale x 8 x double> @llvm.copysign(<vscale x 8 x double> poison, <vscale x 8 x double> poison)
+
+ ret void
+}
+
+define void @fcopysign_bf16() {
+; ZVFH-LABEL: 'fcopysign_bf16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %BF16 = call bfloat @llvm.copysign.bf16(bfloat poison, bfloat poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1BF16 = call <1 x bfloat> @llvm.copysign.v1bf16(<1 x bfloat> poison, <1 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16BF16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V32BF16 = call <32 x bfloat> @llvm.copysign.v32bf16(<32 x bfloat> poison, <32 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16 = call <vscale x 1 x bfloat> @llvm.copysign.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16 = call <vscale x 2 x bfloat> @llvm.copysign.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.copysign.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.copysign.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.copysign.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV32BF16 = call <vscale x 32 x bfloat> @llvm.copysign.nxv32bf16(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'fcopysign_bf16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %BF16 = call bfloat @llvm.copysign.bf16(bfloat poison, bfloat poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1BF16 = call <1 x bfloat> @llvm.copysign.v1bf16(<1 x bfloat> poison, <1 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16BF16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V32BF16 = call <32 x bfloat> @llvm.copysign.v32bf16(<32 x bfloat> poison, <32 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16 = call <vscale x 1 x bfloat> @llvm.copysign.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16 = call <vscale x 2 x bfloat> @llvm.copysign.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.copysign.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.copysign.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.copysign.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV32BF16 = call <vscale x 32 x bfloat> @llvm.copysign.nxv32bf16(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; NO-ZFHMIN-LABEL: 'fcopysign_bf16'
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %BF16 = call bfloat @llvm.copysign.bf16(bfloat poison, bfloat poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1BF16 = call <1 x bfloat> @llvm.copysign.v1bf16(<1 x bfloat> poison, <1 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16BF16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32BF16 = call <32 x bfloat> @llvm.copysign.v32bf16(<32 x bfloat> poison, <32 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16 = call <vscale x 1 x bfloat> @llvm.copysign.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16 = call <vscale x 2 x bfloat> @llvm.copysign.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.copysign.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.copysign.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.copysign.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV32BF16 = call <vscale x 32 x bfloat> @llvm.copysign.nxv32bf16(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %BF16 = call bfloat @llvm.copysign(bfloat poison, bfloat poison)
+
+ %V1BF16 = call <1 x bfloat> @llvm.copysign(<1 x bfloat> poison, <1 x bfloat> poison)
+ %V2BF16 = call <2 x bfloat> @llvm.copysign(<2 x bfloat> poison, <2 x bfloat> poison)
+ %V4BF16 = call <4 x bfloat> @llvm.copysign(<4 x bfloat> poison, <4 x bfloat> poison)
+ %V8BF16 = call <8 x bfloat> @llvm.copysign(<8 x bfloat> poison, <8 x bfloat> poison)
+ %V16BF16 = call <16 x bfloat> @llvm.copysign(<16 x bfloat> poison, <16 x bfloat> poison)
+ %V32BF16 = call <32 x bfloat> @llvm.copysign(<32 x bfloat> poison, <32 x bfloat> poison)
+
+ %NXV1BF16 = call <vscale x 1 x bfloat> @llvm.copysign(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison)
+ %NXV2BF16 = call <vscale x 2 x bfloat> @llvm.copysign(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison)
+ %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.copysign(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison)
+ %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.copysign(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison)
+ %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.copysign(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison)
+ %NXV32BF16 = call <vscale x 32 x bfloat> @llvm.copysign(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> poison)
ret void
}
define void @fcopysign_f16() {
; ZVFH-LABEL: 'fcopysign_f16'
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F16 = call half @llvm.copysign.f16(half undef, half undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = call <1 x half> @llvm.copysign.v1f16(<1 x half> undef, <1 x half> undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32F16 = call <32 x half> @llvm.copysign.v32f16(<32 x half> undef, <32 x half> undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = call <vscale x 1 x half> @llvm.copysign.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = call <vscale x 2 x half> @llvm.copysign.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = call <vscale x 4 x half> @llvm.copysign.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV8F16 = call <vscale x 8 x half> @llvm.copysign.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV16F16 = call <vscale x 16 x half> @llvm.copysign.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef)
-; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV32F16 = call <vscale x 32 x half> @llvm.copysign.nxv32f16(<vscale x 32 x half> undef, <vscale x 32 x half> undef)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F16 = call half @llvm.copysign.f16(half poison, half poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = call <1 x half> @llvm.copysign.v1f16(<1 x half> poison, <1 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> poison, <2 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> poison, <4 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> poison, <8 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> poison, <16 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32F16 = call <32 x half> @llvm.copysign.v32f16(<32 x half> poison, <32 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = call <vscale x 1 x half> @llvm.copysign.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = call <vscale x 2 x half> @llvm.copysign.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = call <vscale x 4 x half> @llvm.copysign.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV8F16 = call <vscale x 8 x half> @llvm.copysign.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV16F16 = call <vscale x 16 x half> @llvm.copysign.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV32F16 = call <vscale x 32 x half> @llvm.copysign.nxv32f16(<vscale x 32 x half> poison, <vscale x 32 x half> poison)
; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; ZVFHMIN-LABEL: 'fcopysign_f16'
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F16 = call half @llvm.copysign.f16(half undef, half undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1F16 = call <1 x half> @llvm.copysign.v1f16(<1 x half> undef, <1 x half> undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16F16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
-; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V32F16 = call <32 x half> @llvm.copysign.v32f16(<32 x half> undef, <32 x half> undef)
-; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16 = call <vscale x 1 x half> @llvm.copysign.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef)
-; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16 = call <vscale x 2 x half> @llvm.copysign.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef)
-; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16 = call <vscale x 4 x half> @llvm.copysign.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef)
-; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16 = call <vscale x 8 x half> @llvm.copysign.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef)
-; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16 = call <vscale x 16 x half> @llvm.copysign.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef)
-; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV32F16 = call <vscale x 32 x half> @llvm.copysign.nxv32f16(<vscale x 32 x half> undef, <vscale x 32 x half> undef)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F16 = call half @llvm.copysign.f16(half poison, half poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1F16 = call <1 x half> @llvm.copysign.v1f16(<1 x half> poison, <1 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> poison, <2 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> poison, <4 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> poison, <8 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16F16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> poison, <16 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V32F16 = call <32 x half> @llvm.copysign.v32f16(<32 x half> poison, <32 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16 = call <vscale x 1 x half> @llvm.copysign.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16 = call <vscale x 2 x half> @llvm.copysign.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16 = call <vscale x 4 x half> @llvm.copysign.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16 = call <vscale x 8 x half> @llvm.copysign.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16 = call <vscale x 16 x half> @llvm.copysign.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV32F16 = call <vscale x 32 x half> @llvm.copysign.nxv32f16(<vscale x 32 x half> poison, <vscale x 32 x half> poison)
; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %F16 = call half @llvm.copysign.f16(half undef, half undef)
-
- %V1F16 = call <1 x half> @llvm.copysign.v1f16(<1 x half> undef, <1 x half> undef)
- %V2F16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
- %V4F16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
- %V8F16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
- %V16F16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
- %V32F16 = call <32 x half> @llvm.copysign.v32f16(<32 x half> undef, <32 x half> undef)
-
- %NXV1F16 = call <vscale x 1 x half> @llvm.copysign.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef)
- %NXV2F16 = call <vscale x 2 x half> @llvm.copysign.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef)
- %NXV4F16 = call <vscale x 4 x half> @llvm.copysign.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef)
- %NXV8F16 = call <vscale x 8 x half> @llvm.copysign.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef)
- %NXV16F16 = call <vscale x 16 x half> @llvm.copysign.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef)
- %NXV32F16 = call <vscale x 32 x half> @llvm.copysign.nxv32f16(<vscale x 32 x half> undef, <vscale x 32 x half> undef)
+; NO-ZFHMIN-LABEL: 'fcopysign_f16'
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F16 = call half @llvm.copysign.f16(half poison, half poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = call <1 x half> @llvm.copysign.v1f16(<1 x half> poison, <1 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> poison, <2 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> poison, <4 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> poison, <8 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> poison, <16 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32F16 = call <32 x half> @llvm.copysign.v32f16(<32 x half> poison, <32 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16 = call <vscale x 1 x half> @llvm.copysign.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16 = call <vscale x 2 x half> @llvm.copysign.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16 = call <vscale x 4 x half> @llvm.copysign.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16 = call <vscale x 8 x half> @llvm.copysign.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16 = call <vscale x 16 x half> @llvm.copysign.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV32F16 = call <vscale x 32 x half> @llvm.copysign.nxv32f16(<vscale x 32 x half> poison, <vscale x 32 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %F16 = call half @llvm.copysign(half poison, half poison)
+
+ %V1F16 = call <1 x half> @llvm.copysign(<1 x half> poison, <1 x half> poison)
+ %V2F16 = call <2 x half> @llvm.copysign(<2 x half> poison, <2 x half> poison)
+ %V4F16 = call <4 x half> @llvm.copysign(<4 x half> poison, <4 x half> poison)
+ %V8F16 = call <8 x half> @llvm.copysign(<8 x half> poison, <8 x half> poison)
+ %V16F16 = call <16 x half> @llvm.copysign(<16 x half> poison, <16 x half> poison)
+ %V32F16 = call <32 x half> @llvm.copysign(<32 x half> poison, <32 x half> poison)
+
+ %NXV1F16 = call <vscale x 1 x half> @llvm.copysign(<vscale x 1 x half> poison, <vscale x 1 x half> poison)
+ %NXV2F16 = call <vscale x 2 x half> @llvm.copysign(<vscale x 2 x half> poison, <vscale x 2 x half> poison)
+ %NXV4F16 = call <vscale x 4 x half> @llvm.copysign(<vscale x 4 x half> poison, <vscale x 4 x half> poison)
+ %NXV8F16 = call <vscale x 8 x half> @llvm.copysign(<vscale x 8 x half> poison, <vscale x 8 x half> poison)
+ %NXV16F16 = call <vscale x 16 x half> @llvm.copysign(<vscale x 16 x half> poison, <vscale x 16 x half> poison)
+ %NXV32F16 = call <vscale x 32 x half> @llvm.copysign(<vscale x 32 x half> poison, <vscale x 32 x half> poison)
ret void
}
define void @fma() {
; CHECK-LABEL: 'fma'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %BF16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1BF16 = call <1 x bfloat> @llvm.fma.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef, <1 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2BF16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4BF16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8BF16 = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16BF16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1BF16 = call <vscale x 1 x bfloat> @llvm.fma.nxv1bf16(<vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2BF16 = call <vscale x 2 x bfloat> @llvm.fma.nxv2bf16(<vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.fma.nxv4bf16(<vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.fma.nxv8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.fma.nxv16bf16(<vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F32 = call <1 x float> @llvm.fma.v1f32(<1 x float> undef, <1 x float> undef, <1 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = call <vscale x 1 x float> @llvm.fma.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef, <vscale x 1 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = call <vscale x 2 x float> @llvm.fma.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F32 = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F32 = call <vscale x 8 x float> @llvm.fma.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F32 = call <vscale x 16 x float> @llvm.fma.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.fma.v1f64(<1 x double> undef, <1 x double> undef, <1 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = call <vscale x 1 x double> @llvm.fma.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef, <vscale x 1 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F64 = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F64 = call <vscale x 4 x double> @llvm.fma.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F64 = call <vscale x 8 x double> @llvm.fma.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.fma.f32(float poison, float poison, float poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.fma.f64(double poison, double poison, double poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F32 = call <1 x float> @llvm.fma.v1f32(<1 x float> poison, <1 x float> poison, <1 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.fma.v2f32(<2 x float> poison, <2 x float> poison, <2 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> poison, <4 x float> poison, <4 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> poison, <8 x float> poison, <8 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> poison, <16 x float> poison, <16 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = call <vscale x 1 x float> @llvm.fma.nxv1f32(<vscale x 1 x float> poison, <vscale x 1 x float> poison, <vscale x 1 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = call <vscale x 2 x float> @llvm.fma.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F32 = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV8F32 = call <vscale x 8 x float> @llvm.fma.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F32 = call <vscale x 16 x float> @llvm.fma.nxv16f32(<vscale x 16 x float> poison, <vscale x 16 x float> poison, <vscale x 16 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.fma.v1f64(<1 x double> poison, <1 x double> poison, <1 x double> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> poison, <2 x double> poison, <2 x double> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> poison, <4 x double> poison, <4 x double> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> poison, <8 x double> poison, <8 x double> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = call <vscale x 1 x double> @llvm.fma.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x double> poison, <vscale x 1 x double> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F64 = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x double> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F64 = call <vscale x 4 x double> @llvm.fma.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x double> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F64 = call <vscale x 8 x double> @llvm.fma.nxv8f64(<vscale x 8 x double> poison, <vscale x 8 x double> poison, <vscale x 8 x double> poison)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %BF16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
- %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
- %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
-
- %V1BF16 = call <1 x bfloat> @llvm.fma.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef, <1 x bfloat> undef)
- %V2BF16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
- %V4BF16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
- %V8BF16 = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x bfloat> undef)
- %V16BF16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
-
- %NXV1BF16 = call <vscale x 1 x bfloat> @llvm.fma.nxv1bf16(<vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef)
- %NXV2BF16 = call <vscale x 2 x bfloat> @llvm.fma.nxv2bf16(<vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef)
- %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.fma.nxv4bf16(<vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef)
- %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.fma.nxv8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef)
- %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.fma.nxv16bf16(<vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef)
-
- %V1F32 = call <1 x float> @llvm.fma.v1f32(<1 x float> undef, <1 x float> undef, <1 x float> undef)
- %V2F32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
- %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
- %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
- %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
-
- %NXV1F32 = call <vscale x 1 x float> @llvm.fma.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef, <vscale x 1 x float> undef)
- %NXV2F32 = call <vscale x 2 x float> @llvm.fma.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef)
- %NXV4F32 = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef)
- %NXV8F32 = call <vscale x 8 x float> @llvm.fma.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef)
- %NXV16F32 = call <vscale x 16 x float> @llvm.fma.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x float> undef)
-
- %V1F64 = call <1 x double> @llvm.fma.v1f64(<1 x double> undef, <1 x double> undef, <1 x double> undef)
- %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
- %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
- %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
-
- %NXV1F64 = call <vscale x 1 x double> @llvm.fma.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef, <vscale x 1 x double> undef)
- %NXV2F64 = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef)
- %NXV4F64 = call <vscale x 4 x double> @llvm.fma.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef)
- %NXV8F64 = call <vscale x 8 x double> @llvm.fma.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x double> undef)
+ %F32 = call float @llvm.fma(float poison, float poison, float poison)
+ %F64 = call double @llvm.fma(double poison, double poison, double poison)
+
+ %V1F32 = call <1 x float> @llvm.fma(<1 x float> poison, <1 x float> poison, <1 x float> poison)
+ %V2F32 = call <2 x float> @llvm.fma(<2 x float> poison, <2 x float> poison, <2 x float> poison)
+ %V4F32 = call <4 x float> @llvm.fma(<4 x float> poison, <4 x float> poison, <4 x float> poison)
+ %V8F32 = call <8 x float> @llvm.fma(<8 x float> poison, <8 x float> poison, <8 x float> poison)
+ %V16F32 = call <16 x float> @llvm.fma(<16 x float> poison, <16 x float> poison, <16 x float> poison)
+
+ %NXV1F32 = call <vscale x 1 x float> @llvm.fma(<vscale x 1 x float> poison, <vscale x 1 x float> poison, <vscale x 1 x float> poison)
+ %NXV2F32 = call <vscale x 2 x float> @llvm.fma(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x float> poison)
+ %NXV4F32 = call <vscale x 4 x float> @llvm.fma(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x float> poison)
+ %NXV8F32 = call <vscale x 8 x float> @llvm.fma(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x float> poison)
+ %NXV16F32 = call <vscale x 16 x float> @llvm.fma(<vscale x 16 x float> poison, <vscale x 16 x float> poison, <vscale x 16 x float> poison)
+
+ %V1F64 = call <1 x double> @llvm.fma(<1 x double> poison, <1 x double> poison, <1 x double> poison)
+ %V2F64 = call <2 x double> @llvm.fma(<2 x double> poison, <2 x double> poison, <2 x double> poison)
+ %V4F64 = call <4 x double> @llvm.fma(<4 x double> poison, <4 x double> poison, <4 x double> poison)
+ %V8F64 = call <8 x double> @llvm.fma(<8 x double> poison, <8 x double> poison, <8 x double> poison)
+
+ %NXV1F64 = call <vscale x 1 x double> @llvm.fma(<vscale x 1 x double> poison, <vscale x 1 x double> poison, <vscale x 1 x double> poison)
+ %NXV2F64 = call <vscale x 2 x double> @llvm.fma(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x double> poison)
+ %NXV4F64 = call <vscale x 4 x double> @llvm.fma(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x double> poison)
+ %NXV8F64 = call <vscale x 8 x double> @llvm.fma(<vscale x 8 x double> poison, <vscale x 8 x double> poison, <vscale x 8 x double> poison)
+
+ ret void
+}
+
+define void @fma_bf16() {
+; ZVFH-LABEL: 'fma_bf16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %BF16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1BF16 = call <1 x bfloat> @llvm.fma.v1bf16(<1 x bfloat> poison, <1 x bfloat> poison, <1 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2BF16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4BF16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8BF16 = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison, <8 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16BF16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32BF16 = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> poison, <32 x bfloat> poison, <32 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1BF16 = call <vscale x 1 x bfloat> @llvm.fma.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2BF16 = call <vscale x 2 x bfloat> @llvm.fma.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.fma.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.fma.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.fma.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV32BF16 = call <vscale x 32 x bfloat> @llvm.fma.nxv32bf16(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'fma_bf16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %BF16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1BF16 = call <1 x bfloat> @llvm.fma.v1bf16(<1 x bfloat> poison, <1 x bfloat> poison, <1 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2BF16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4BF16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8BF16 = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison, <8 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16BF16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32BF16 = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> poison, <32 x bfloat> poison, <32 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1BF16 = call <vscale x 1 x bfloat> @llvm.fma.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2BF16 = call <vscale x 2 x bfloat> @llvm.fma.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.fma.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.fma.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.fma.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV32BF16 = call <vscale x 32 x bfloat> @llvm.fma.nxv32bf16(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; NO-ZFHMIN-LABEL: 'fma_bf16'
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %BF16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1BF16 = call <1 x bfloat> @llvm.fma.v1bf16(<1 x bfloat> poison, <1 x bfloat> poison, <1 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16 = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison, <8 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16BF16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32BF16 = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> poison, <32 x bfloat> poison, <32 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16 = call <vscale x 1 x bfloat> @llvm.fma.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16 = call <vscale x 2 x bfloat> @llvm.fma.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.fma.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.fma.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.fma.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV32BF16 = call <vscale x 32 x bfloat> @llvm.fma.nxv32bf16(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %BF16 = call bfloat @llvm.fma(bfloat poison, bfloat poison, bfloat poison)
+
+ %V1BF16 = call <1 x bfloat> @llvm.fma(<1 x bfloat> poison, <1 x bfloat> poison, <1 x bfloat> poison)
+ %V2BF16 = call <2 x bfloat> @llvm.fma(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison)
+ %V4BF16 = call <4 x bfloat> @llvm.fma(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison)
+ %V8BF16 = call <8 x bfloat> @llvm.fma(<8 x bfloat> poison, <8 x bfloat> poison, <8 x bfloat> poison)
+ %V16BF16 = call <16 x bfloat> @llvm.fma(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison)
+ %V32BF16 = call <32 x bfloat> @llvm.fma(<32 x bfloat> poison, <32 x bfloat> poison, <32 x bfloat> poison)
+
+ %NXV1BF16 = call <vscale x 1 x bfloat> @llvm.fma(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison)
+ %NXV2BF16 = call <vscale x 2 x bfloat> @llvm.fma(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison)
+ %NXV4BF16 = call <vscale x 4 x bfloat> @llvm.fma(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison)
+ %NXV8BF16 = call <vscale x 8 x bfloat> @llvm.fma(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison)
+ %NXV16BF16 = call <vscale x 16 x bfloat> @llvm.fma(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison)
+ %NXV32BF16 = call <vscale x 32 x bfloat> @llvm.fma(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> poison)
ret void
}
define void @fma_f16() {
-; CHECK-LABEL: 'fma_f16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F16 = call half @llvm.fma.f16(half undef, half undef, half undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = call <1 x half> @llvm.fma.v1f16(<1 x half> undef, <1 x half> undef, <1 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = call <8 x half> @llvm.fma.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = call <32 x half> @llvm.fma.v32f16(<32 x half> undef, <32 x half> undef, <32 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = call <vscale x 1 x half> @llvm.fma.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef, <vscale x 1 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = call <vscale x 2 x half> @llvm.fma.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef, <vscale x 2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = call <vscale x 4 x half> @llvm.fma.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV8F16 = call <vscale x 8 x half> @llvm.fma.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV16F16 = call <vscale x 16 x half> @llvm.fma.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV32F16 = call <vscale x 32 x half> @llvm.fma.nxv32f16(<vscale x 32 x half> undef, <vscale x 32 x half> undef, <vscale x 32 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; ZVFH-LABEL: 'fma_f16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F16 = call half @llvm.fma.f16(half poison, half poison, half poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = call <1 x half> @llvm.fma.v1f16(<1 x half> poison, <1 x half> poison, <1 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = call <2 x half> @llvm.fma.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = call <4 x half> @llvm.fma.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = call <8 x half> @llvm.fma.v8f16(<8 x half> poison, <8 x half> poison, <8 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = call <16 x half> @llvm.fma.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = call <32 x half> @llvm.fma.v32f16(<32 x half> poison, <32 x half> poison, <32 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = call <vscale x 1 x half> @llvm.fma.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = call <vscale x 2 x half> @llvm.fma.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = call <vscale x 4 x half> @llvm.fma.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV8F16 = call <vscale x 8 x half> @llvm.fma.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV16F16 = call <vscale x 16 x half> @llvm.fma.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV32F16 = call <vscale x 32 x half> @llvm.fma.nxv32f16(<vscale x 32 x half> poison, <vscale x 32 x half> poison, <vscale x 32 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'fma_f16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F16 = call half @llvm.fma.f16(half poison, half poison, half poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = call <1 x half> @llvm.fma.v1f16(<1 x half> poison, <1 x half> poison, <1 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = call <2 x half> @llvm.fma.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = call <4 x half> @llvm.fma.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = call <8 x half> @llvm.fma.v8f16(<8 x half> poison, <8 x half> poison, <8 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = call <16 x half> @llvm.fma.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = call <32 x half> @llvm.fma.v32f16(<32 x half> poison, <32 x half> poison, <32 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = call <vscale x 1 x half> @llvm.fma.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = call <vscale x 2 x half> @llvm.fma.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = call <vscale x 4 x half> @llvm.fma.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV8F16 = call <vscale x 8 x half> @llvm.fma.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV16F16 = call <vscale x 16 x half> @llvm.fma.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV32F16 = call <vscale x 32 x half> @llvm.fma.nxv32f16(<vscale x 32 x half> poison, <vscale x 32 x half> poison, <vscale x 32 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; NO-ZFHMIN-LABEL: 'fma_f16'
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F16 = call half @llvm.fma.f16(half poison, half poison, half poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = call <1 x half> @llvm.fma.v1f16(<1 x half> poison, <1 x half> poison, <1 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F16 = call <2 x half> @llvm.fma.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F16 = call <4 x half> @llvm.fma.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F16 = call <8 x half> @llvm.fma.v8f16(<8 x half> poison, <8 x half> poison, <8 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F16 = call <16 x half> @llvm.fma.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32F16 = call <32 x half> @llvm.fma.v32f16(<32 x half> poison, <32 x half> poison, <32 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16 = call <vscale x 1 x half> @llvm.fma.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16 = call <vscale x 2 x half> @llvm.fma.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16 = call <vscale x 4 x half> @llvm.fma.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16 = call <vscale x 8 x half> @llvm.fma.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16 = call <vscale x 16 x half> @llvm.fma.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %NXV32F16 = call <vscale x 32 x half> @llvm.fma.nxv32f16(<vscale x 32 x half> poison, <vscale x 32 x half> poison, <vscale x 32 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %F16 = call half @llvm.fma.f16(half undef, half undef, half undef)
-
- %V1F16 = call <1 x half> @llvm.fma.v1f16(<1 x half> undef, <1 x half> undef, <1 x half> undef)
- %V2F16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
- %V4F16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
- %V8F16 = call <8 x half> @llvm.fma.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef)
- %V16F16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
- %V32F16 = call <32 x half> @llvm.fma.v32f16(<32 x half> undef, <32 x half> undef, <32 x half> undef)
-
- %NXV1F16 = call <vscale x 1 x half> @llvm.fma.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef, <vscale x 1 x half> undef)
- %NXV2F16 = call <vscale x 2 x half> @llvm.fma.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef, <vscale x 2 x half> undef)
- %NXV4F16 = call <vscale x 4 x half> @llvm.fma.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
- %NXV8F16 = call <vscale x 8 x half> @llvm.fma.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x half> undef)
- %NXV16F16 = call <vscale x 16 x half> @llvm.fma.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x half> undef)
- %NXV32F16 = call <vscale x 32 x half> @llvm.fma.nxv32f16(<vscale x 32 x half> undef, <vscale x 32 x half> undef, <vscale x 32 x half> undef)
+ %F16 = call half @llvm.fma(half poison, half poison, half poison)
+
+ %V1F16 = call <1 x half> @llvm.fma(<1 x half> poison, <1 x half> poison, <1 x half> poison)
+ %V2F16 = call <2 x half> @llvm.fma(<2 x half> poison, <2 x half> poison, <2 x half> poison)
+ %V4F16 = call <4 x half> @llvm.fma(<4 x half> poison, <4 x half> poison, <4 x half> poison)
+ %V8F16 = call <8 x half> @llvm.fma(<8 x half> poison, <8 x half> poison, <8 x half> poison)
+ %V16F16 = call <16 x half> @llvm.fma(<16 x half> poison, <16 x half> poison, <16 x half> poison)
+ %V32F16 = call <32 x half> @llvm.fma(<32 x half> poison, <32 x half> poison, <32 x half> poison)
+
+ %NXV1F16 = call <vscale x 1 x half> @llvm.fma(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x half> poison)
+ %NXV2F16 = call <vscale x 2 x half> @llvm.fma(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x half> poison)
+ %NXV4F16 = call <vscale x 4 x half> @llvm.fma(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x half> poison)
+ %NXV8F16 = call <vscale x 8 x half> @llvm.fma(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x half> poison)
+ %NXV16F16 = call <vscale x 16 x half> @llvm.fma(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x half> poison)
+ %NXV32F16 = call <vscale x 32 x half> @llvm.fma(<vscale x 32 x half> poison, <vscale x 32 x half> poison, <vscale x 32 x half> poison)
ret void
}
define void @fmuladd() {
; CHECK-LABEL: 'fmuladd'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call bfloat @llvm.fmuladd.bf16(bfloat undef, bfloat undef, bfloat undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call float @llvm.fmuladd.f32(float undef, float undef, float undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call double @llvm.fmuladd.f64(double undef, double undef, double undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <8 x bfloat> @llvm.fmuladd.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %7 = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %13 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %14 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %15 = call <16 x double> @llvm.fmuladd.v16f64(<16 x double> undef, <16 x double> undef, <16 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call <vscale x 1 x bfloat> @llvm.fmuladd.nxv1bf16(<vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <vscale x 2 x bfloat> @llvm.fmuladd.nxv2bf16(<vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = call <vscale x 4 x bfloat> @llvm.fmuladd.nxv4bf16(<vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = call <vscale x 8 x bfloat> @llvm.fmuladd.nxv8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %20 = call <vscale x 16 x bfloat> @llvm.fmuladd.nxv16bf16(<vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = call <vscale x 1 x float> @llvm.fmuladd.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef, <vscale x 1 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = call <vscale x 2 x float> @llvm.fmuladd.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %23 = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %24 = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %25 = call <vscale x 16 x float> @llvm.fmuladd.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = call <vscale x 1 x double> @llvm.fmuladd.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef, <vscale x 1 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %27 = call <vscale x 2 x double> @llvm.fmuladd.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %28 = call <vscale x 4 x double> @llvm.fmuladd.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %29 = call <vscale x 8 x double> @llvm.fmuladd.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %30 = call <vscale x 16 x double> @llvm.fmuladd.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x double> undef, <vscale x 16 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call float @llvm.fmuladd.f32(float poison, float poison, float poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call double @llvm.fmuladd.f64(double poison, double poison, double poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> poison, <2 x float> poison, <2 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> poison, <4 x float> poison, <4 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> poison, <8 x float> poison, <8 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %6 = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> poison, <16 x float> poison, <16 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> poison, <2 x double> poison, <2 x double> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> poison, <4 x double> poison, <4 x double> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> poison, <8 x double> poison, <8 x double> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %10 = call <16 x double> @llvm.fmuladd.v16f64(<16 x double> poison, <16 x double> poison, <16 x double> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <vscale x 1 x float> @llvm.fmuladd.nxv1f32(<vscale x 1 x float> poison, <vscale x 1 x float> poison, <vscale x 1 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <vscale x 2 x float> @llvm.fmuladd.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %13 = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %14 = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %15 = call <vscale x 16 x float> @llvm.fmuladd.nxv16f32(<vscale x 16 x float> poison, <vscale x 16 x float> poison, <vscale x 16 x float> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call <vscale x 1 x double> @llvm.fmuladd.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x double> poison, <vscale x 1 x double> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %17 = call <vscale x 2 x double> @llvm.fmuladd.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x double> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %18 = call <vscale x 4 x double> @llvm.fmuladd.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x double> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %19 = call <vscale x 8 x double> @llvm.fmuladd.nxv8f64(<vscale x 8 x double> poison, <vscale x 8 x double> poison, <vscale x 8 x double> poison)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %20 = call <vscale x 16 x double> @llvm.fmuladd.nxv16f64(<vscale x 16 x double> poison, <vscale x 16 x double> poison, <vscale x 16 x double> poison)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- call bfloat @llvm.fmuladd.bf16(bfloat undef, bfloat undef, bfloat undef)
- call float @llvm.fmuladd.f32(float undef, float undef, float undef)
- call double @llvm.fmuladd.f64(double undef, double undef, double undef)
- call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
- call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
- call <8 x bfloat> @llvm.fmuladd.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x bfloat> undef)
- call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
- call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
- call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
- call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
- call <16 x float> @llvm.fmuladd.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
- call <2 x double> @llvm.fmuladd.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
- call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
- call <8 x double> @llvm.fmuladd.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
- call <16 x double> @llvm.fmuladd.v16f64(<16 x double> undef, <16 x double> undef, <16 x double> undef)
- call <vscale x 1 x bfloat> @llvm.fmuladd.nxv1bf16(<vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef)
- call <vscale x 2 x bfloat> @llvm.fmuladd.nxv2bf16(<vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef)
- call <vscale x 4 x bfloat> @llvm.fmuladd.nxv4bf16(<vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef)
- call <vscale x 8 x bfloat> @llvm.fmuladd.nxv8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef)
- call <vscale x 16 x bfloat> @llvm.fmuladd.nxv16bf16(<vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef)
- call <vscale x 1 x float> @llvm.fmuladd.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef, <vscale x 1 x float> undef)
- call <vscale x 2 x float> @llvm.fmuladd.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef)
- call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef)
- call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef)
- call <vscale x 16 x float> @llvm.fmuladd.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x float> undef)
- call <vscale x 1 x double> @llvm.fmuladd.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef, <vscale x 1 x double> undef)
- call <vscale x 2 x double> @llvm.fmuladd.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef)
- call <vscale x 4 x double> @llvm.fmuladd.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef)
- call <vscale x 8 x double> @llvm.fmuladd.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x double> undef, <vscale x 8 x double> undef)
- call <vscale x 16 x double> @llvm.fmuladd.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x double> undef, <vscale x 16 x double> undef)
+ call float @llvm.fmuladd(float poison, float poison, float poison)
+ call double @llvm.fmuladd(double poison, double poison, double poison)
+
+ call <2 x float> @llvm.fmuladd(<2 x float> poison, <2 x float> poison, <2 x float> poison)
+ call <4 x float> @llvm.fmuladd(<4 x float> poison, <4 x float> poison, <4 x float> poison)
+ call <8 x float> @llvm.fmuladd(<8 x float> poison, <8 x float> poison, <8 x float> poison)
+ call <16 x float> @llvm.fmuladd(<16 x float> poison, <16 x float> poison, <16 x float> poison)
+
+ call <2 x double> @llvm.fmuladd(<2 x double> poison, <2 x double> poison, <2 x double> poison)
+ call <4 x double> @llvm.fmuladd(<4 x double> poison, <4 x double> poison, <4 x double> poison)
+ call <8 x double> @llvm.fmuladd(<8 x double> poison, <8 x double> poison, <8 x double> poison)
+ call <16 x double> @llvm.fmuladd(<16 x double> poison, <16 x double> poison, <16 x double> poison)
+
+ call <vscale x 1 x float> @llvm.fmuladd(<vscale x 1 x float> poison, <vscale x 1 x float> poison, <vscale x 1 x float> poison)
+ call <vscale x 2 x float> @llvm.fmuladd(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x float> poison)
+ call <vscale x 4 x float> @llvm.fmuladd(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x float> poison)
+ call <vscale x 8 x float> @llvm.fmuladd(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x float> poison)
+ call <vscale x 16 x float> @llvm.fmuladd(<vscale x 16 x float> poison, <vscale x 16 x float> poison, <vscale x 16 x float> poison)
+
+ call <vscale x 1 x double> @llvm.fmuladd(<vscale x 1 x double> poison, <vscale x 1 x double> poison, <vscale x 1 x double> poison)
+ call <vscale x 2 x double> @llvm.fmuladd(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x double> poison)
+ call <vscale x 4 x double> @llvm.fmuladd(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x double> poison)
+ call <vscale x 8 x double> @llvm.fmuladd(<vscale x 8 x double> poison, <vscale x 8 x double> poison, <vscale x 8 x double> poison)
+ call <vscale x 16 x double> @llvm.fmuladd(<vscale x 16 x double> poison, <vscale x 16 x double> poison, <vscale x 16 x double> poison)
+
+ ret void
+}
+
+define void @fmuladd_bf16() {
+; ZVFH-LABEL: 'fmuladd_bf16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call bfloat @llvm.fmuladd.bf16(bfloat poison, bfloat poison, bfloat poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x bfloat> @llvm.fmuladd.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison, <8 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x bfloat> @llvm.fmuladd.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x bfloat> @llvm.fmuladd.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x bfloat> @llvm.fmuladd.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <vscale x 8 x bfloat> @llvm.fmuladd.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = call <vscale x 16 x bfloat> @llvm.fmuladd.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'fmuladd_bf16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call bfloat @llvm.fmuladd.bf16(bfloat poison, bfloat poison, bfloat poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x bfloat> @llvm.fmuladd.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison, <8 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x bfloat> @llvm.fmuladd.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x bfloat> @llvm.fmuladd.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x bfloat> @llvm.fmuladd.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <vscale x 8 x bfloat> @llvm.fmuladd.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = call <vscale x 16 x bfloat> @llvm.fmuladd.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; NO-ZFHMIN-LABEL: 'fmuladd_bf16'
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call bfloat @llvm.fmuladd.bf16(bfloat poison, bfloat poison, bfloat poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x bfloat> @llvm.fmuladd.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison, <8 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %5 = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x bfloat> @llvm.fmuladd.nxv1bf16(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x bfloat> @llvm.fmuladd.nxv2bf16(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x bfloat> @llvm.fmuladd.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x bfloat> @llvm.fmuladd.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x bfloat> @llvm.fmuladd.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ call bfloat @llvm.fmuladd(bfloat poison, bfloat poison, bfloat poison)
+
+ call <2 x bfloat> @llvm.fmuladd(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison)
+ call <4 x bfloat> @llvm.fmuladd(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison)
+ call <8 x bfloat> @llvm.fmuladd(<8 x bfloat> poison, <8 x bfloat> poison, <8 x bfloat> poison)
+ call <16 x bfloat> @llvm.fmuladd(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison)
+
+ call <vscale x 1 x bfloat> @llvm.fmuladd(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> poison)
+ call <vscale x 2 x bfloat> @llvm.fmuladd(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> poison)
+ call <vscale x 4 x bfloat> @llvm.fmuladd(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> poison)
+ call <vscale x 8 x bfloat> @llvm.fmuladd(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> poison)
+ call <vscale x 16 x bfloat> @llvm.fmuladd(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> poison)
ret void
}
define void @fmuladd_f16() {
-; CHECK-LABEL: 'fmuladd_f16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.fmuladd.f16(half undef, half undef, half undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.fmuladd.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x half> @llvm.fmuladd.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef, <vscale x 1 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x half> @llvm.fmuladd.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef, <vscale x 2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x half> @llvm.fmuladd.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <vscale x 8 x half> @llvm.fmuladd.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = call <vscale x 16 x half> @llvm.fmuladd.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; ZVFH-LABEL: 'fmuladd_f16'
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.fmuladd.f16(half poison, half poison, half poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.fmuladd.v8f16(<8 x half> poison, <8 x half> poison, <8 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x half> @llvm.fmuladd.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x half> @llvm.fmuladd.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x half> @llvm.fmuladd.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <vscale x 8 x half> @llvm.fmuladd.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = call <vscale x 16 x half> @llvm.fmuladd.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x half> poison)
+; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'fmuladd_f16'
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.fmuladd.f16(half poison, half poison, half poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.fmuladd.v8f16(<8 x half> poison, <8 x half> poison, <8 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x half> @llvm.fmuladd.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x half> @llvm.fmuladd.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x half> @llvm.fmuladd.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <vscale x 8 x half> @llvm.fmuladd.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = call <vscale x 16 x half> @llvm.fmuladd.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x half> poison)
+; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; NO-ZFHMIN-LABEL: 'fmuladd_f16'
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.fmuladd.f16(half poison, half poison, half poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.fmuladd.v8f16(<8 x half> poison, <8 x half> poison, <8 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %5 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.fmuladd.nxv1f16(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.fmuladd.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.fmuladd.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.fmuladd.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x half> @llvm.fmuladd.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x half> poison)
+; NO-ZFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- call half @llvm.fmuladd.f16(half undef, half undef, half undef)
- call <2 x half> @llvm.fmuladd.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
- call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
- call <8 x half> @llvm.fmuladd.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef)
- call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
- call <vscale x 1 x half> @llvm.fmuladd.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef, <vscale x 1 x half> undef)
- call <vscale x 2 x half> @llvm.fmuladd.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef, <vscale x 2 x half> undef)
- call <vscale x 4 x half> @llvm.fmuladd.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
- call <vscale x 8 x half> @llvm.fmuladd.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x half> undef)
- call <vscale x 16 x half> @llvm.fmuladd.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x half> undef)
+ call half @llvm.fmuladd(half poison, half poison, half poison)
+
+ call <2 x half> @llvm.fmuladd(<2 x half> poison, <2 x half> poison, <2 x half> poison)
+ call <4 x half> @llvm.fmuladd(<4 x half> poison, <4 x half> poison, <4 x half> poison)
+ call <8 x half> @llvm.fmuladd(<8 x half> poison, <8 x half> poison, <8 x half> poison)
+ call <16 x half> @llvm.fmuladd(<16 x half> poison, <16 x half> poison, <16 x half> poison)
+
+ call <vscale x 1 x half> @llvm.fmuladd(<vscale x 1 x half> poison, <vscale x 1 x half> poison, <vscale x 1 x half> poison)
+ call <vscale x 2 x half> @llvm.fmuladd(<vscale x 2 x half> poison, <vscale x 2 x half> poison, <vscale x 2 x half> poison)
+ call <vscale x 4 x half> @llvm.fmuladd(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x half> poison)
+ call <vscale x 8 x half> @llvm.fmuladd(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x half> poison)
+ call <vscale x 16 x half> @llvm.fmuladd(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x half> poison)
ret void
}
diff --git a/llvm/test/Analysis/CostModel/RISCV/masked_ldst.ll b/llvm/test/Analysis/CostModel/RISCV/masked_ldst.ll
index 892277a..68c89c3 100644
--- a/llvm/test/Analysis/CostModel/RISCV/masked_ldst.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/masked_ldst.ll
@@ -13,14 +13,14 @@ define void @fixed() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 223 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
entry:
diff --git a/llvm/test/Analysis/CostModel/X86/fshl.ll b/llvm/test/Analysis/CostModel/X86/fshl.ll
index 485e7f6..c53a507 100644
--- a/llvm/test/Analysis/CostModel/X86/fshl.ll
+++ b/llvm/test/Analysis/CostModel/X86/fshl.ll
@@ -414,8 +414,8 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; AVX1-LABEL: 'splatvar_funnel_i64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:13 SizeLat:11 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
; AVX1-NEXT: Cost Model: Found costs of RThru:19 CodeSize:21 Lat:23 SizeLat:30 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
; AVX1-NEXT: Cost Model: Found costs of RThru:38 CodeSize:42 Lat:46 SizeLat:60 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
@@ -423,8 +423,8 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; AVX2-LABEL: 'splatvar_funnel_i64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:11 SizeLat:11 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:14 SizeLat:14 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
; AVX2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:14 Lat:28 SizeLat:28 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
@@ -432,8 +432,8 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; AVX512F-LABEL: 'splatvar_funnel_i64'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
; AVX512F-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:13 SizeLat:10 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
; AVX512F-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:13 SizeLat:9 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
@@ -441,8 +441,8 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; AVX512BW-LABEL: 'splatvar_funnel_i64'
; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:13 SizeLat:9 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:13 SizeLat:9 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
@@ -450,8 +450,8 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; AVX512DQ-LABEL: 'splatvar_funnel_i64'
; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:13 SizeLat:10 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:13 SizeLat:9 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
@@ -459,8 +459,8 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; AVX512VBMI2-LABEL: 'splatvar_funnel_i64'
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
@@ -486,8 +486,8 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; XOP-LABEL: 'splatvar_funnel_i64'
; XOP-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 2 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 2 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
; XOP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:12 SizeLat:10 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
; XOP-NEXT: Cost Model: Found costs of RThru:19 CodeSize:21 Lat:23 SizeLat:30 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
; XOP-NEXT: Cost Model: Found costs of RThru:38 CodeSize:42 Lat:46 SizeLat:60 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
@@ -495,8 +495,8 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; AVX512GFNI-LABEL: 'splatvar_funnel_i64'
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
@@ -532,8 +532,8 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; AVX1-LABEL: 'splatvar_funnel_i32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:12 SizeLat:10 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
; AVX1-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:32 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
; AVX1-NEXT: Cost Model: Found costs of RThru:38 CodeSize:46 Lat:46 SizeLat:64 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
@@ -541,8 +541,8 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; AVX2-LABEL: 'splatvar_funnel_i32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:10 SizeLat:10 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:9 Lat:14 SizeLat:16 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
; AVX2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:18 Lat:28 SizeLat:32 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
@@ -550,8 +550,8 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; AVX512F-LABEL: 'splatvar_funnel_i32'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:9 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
; AVX512F-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:12 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
; AVX512F-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:11 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
@@ -559,8 +559,8 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; AVX512BW-LABEL: 'splatvar_funnel_i32'
; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:9 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:11 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:11 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
@@ -568,8 +568,8 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; AVX512DQ-LABEL: 'splatvar_funnel_i32'
; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:9 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:12 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:11 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
@@ -577,8 +577,8 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; AVX512VBMI2-LABEL: 'splatvar_funnel_i32'
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
@@ -604,8 +604,8 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; XOP-LABEL: 'splatvar_funnel_i32'
; XOP-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 2 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 2 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
; XOP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:12 SizeLat:10 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
; XOP-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:32 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
; XOP-NEXT: Cost Model: Found costs of RThru:38 CodeSize:46 Lat:46 SizeLat:64 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
@@ -613,8 +613,8 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; AVX512GFNI-LABEL: 'splatvar_funnel_i32'
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
@@ -631,108 +631,108 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
define void @splatvar_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %b16, <8 x i16> %b128, <16 x i16> %b256, <32 x i16> %b512, i16 %c16, <8 x i16> %c128, <16 x i16> %c256, <32 x i16> %c512) {
; SSSE3-LABEL: 'splatvar_funnel_i16'
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; SSSE3-NEXT: Cost Model: Found costs of RThru:10 CodeSize:9 Lat:10 SizeLat:11 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; SSSE3-NEXT: Cost Model: Found costs of RThru:20 CodeSize:17 Lat:19 SizeLat:21 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
; SSSE3-NEXT: Cost Model: Found costs of RThru:40 CodeSize:33 Lat:37 SizeLat:41 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'splatvar_funnel_i16'
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:10 SizeLat:10 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; SSE42-NEXT: Cost Model: Found costs of RThru:20 CodeSize:13 Lat:19 SizeLat:19 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
; SSE42-NEXT: Cost Model: Found costs of RThru:40 CodeSize:25 Lat:37 SizeLat:37 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'splatvar_funnel_i16'
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 3 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 3 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:12 SizeLat:10 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; AVX1-NEXT: Cost Model: Found costs of RThru:19 CodeSize:25 Lat:23 SizeLat:33 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
; AVX1-NEXT: Cost Model: Found costs of RThru:38 CodeSize:50 Lat:46 SizeLat:66 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'splatvar_funnel_i16'
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:10 SizeLat:10 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:9 Lat:14 SizeLat:16 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
; AVX2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:18 Lat:28 SizeLat:32 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'splatvar_funnel_i16'
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:9 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; AVX512F-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:13 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
; AVX512F-NEXT: Cost Model: Found costs of RThru:20 CodeSize:26 Lat:38 SizeLat:30 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'splatvar_funnel_i16'
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:9 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:11 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:11 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512DQ-LABEL: 'splatvar_funnel_i16'
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:9 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:13 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:20 CodeSize:26 Lat:38 SizeLat:30 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'splatvar_funnel_i16'
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SLM-LABEL: 'splatvar_funnel_i16'
-; SLM-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; SLM-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; SLM-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; SLM-NEXT: Cost Model: Found costs of RThru:12 CodeSize:7 Lat:12 SizeLat:11 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; SLM-NEXT: Cost Model: Found costs of RThru:24 CodeSize:13 Lat:23 SizeLat:21 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
; SLM-NEXT: Cost Model: Found costs of RThru:48 CodeSize:25 Lat:45 SizeLat:41 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'splatvar_funnel_i16'
-; GLM-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; GLM-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; GLM-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; GLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; GLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; GLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; GLM-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:10 SizeLat:10 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; GLM-NEXT: Cost Model: Found costs of RThru:20 CodeSize:13 Lat:19 SizeLat:19 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
; GLM-NEXT: Cost Model: Found costs of RThru:40 CodeSize:25 Lat:37 SizeLat:37 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'splatvar_funnel_i16'
-; XOP-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 3 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 3 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; XOP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:12 SizeLat:10 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; XOP-NEXT: Cost Model: Found costs of RThru:19 CodeSize:25 Lat:23 SizeLat:33 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
; XOP-NEXT: Cost Model: Found costs of RThru:38 CodeSize:50 Lat:46 SizeLat:66 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512GFNI-LABEL: 'splatvar_funnel_i16'
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
@@ -749,108 +749,108 @@ define void @splatvar_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
define void @splatvar_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %b8, <16 x i8> %b128, <32 x i8> %b256, <64 x i8> %b512, i8 %c8, <16 x i8> %c128, <32 x i8> %c256, <64 x i8> %c512) {
; SSSE3-LABEL: 'splatvar_funnel_i8'
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; SSSE3-NEXT: Cost Model: Found costs of RThru:24 CodeSize:18 Lat:29 SizeLat:25 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; SSSE3-NEXT: Cost Model: Found costs of RThru:48 CodeSize:35 Lat:57 SizeLat:49 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
; SSSE3-NEXT: Cost Model: Found costs of RThru:96 CodeSize:69 Lat:113 SizeLat:97 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'splatvar_funnel_i8'
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:16 Lat:29 SizeLat:24 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; SSE42-NEXT: Cost Model: Found costs of RThru:48 CodeSize:31 Lat:57 SizeLat:47 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
; SSE42-NEXT: Cost Model: Found costs of RThru:96 CodeSize:61 Lat:113 SizeLat:93 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'splatvar_funnel_i8'
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:16 Lat:18 SizeLat:22 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; AVX1-NEXT: Cost Model: Found costs of RThru:27 CodeSize:36 Lat:26 SizeLat:47 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
; AVX1-NEXT: Cost Model: Found costs of RThru:54 CodeSize:72 Lat:52 SizeLat:94 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'splatvar_funnel_i8'
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:20 SizeLat:21 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:21 SizeLat:27 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
; AVX2-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:42 SizeLat:54 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'splatvar_funnel_i8'
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:11 CodeSize:15 Lat:19 SizeLat:20 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; AVX512F-NEXT: Cost Model: Found costs of RThru:13 CodeSize:18 Lat:20 SizeLat:24 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
; AVX512F-NEXT: Cost Model: Found costs of RThru:40 CodeSize:73 Lat:56 SizeLat:85 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'splatvar_funnel_i8'
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:11 CodeSize:15 Lat:20 SizeLat:20 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:13 CodeSize:18 Lat:20 SizeLat:22 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:13 CodeSize:18 Lat:20 SizeLat:23 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512DQ-LABEL: 'splatvar_funnel_i8'
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:11 CodeSize:15 Lat:19 SizeLat:20 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:13 CodeSize:18 Lat:20 SizeLat:24 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:40 CodeSize:73 Lat:56 SizeLat:85 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'splatvar_funnel_i8'
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:15 Lat:20 SizeLat:20 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:18 Lat:20 SizeLat:22 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:18 Lat:20 SizeLat:23 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SLM-LABEL: 'splatvar_funnel_i8'
-; SLM-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; SLM-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; SLM-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; SLM-NEXT: Cost Model: Found costs of RThru:26 CodeSize:16 Lat:31 SizeLat:25 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; SLM-NEXT: Cost Model: Found costs of RThru:52 CodeSize:31 Lat:61 SizeLat:49 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
; SLM-NEXT: Cost Model: Found costs of RThru:104 CodeSize:61 Lat:121 SizeLat:97 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'splatvar_funnel_i8'
-; GLM-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; GLM-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; GLM-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; GLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; GLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; GLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; GLM-NEXT: Cost Model: Found costs of RThru:24 CodeSize:16 Lat:29 SizeLat:24 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; GLM-NEXT: Cost Model: Found costs of RThru:48 CodeSize:31 Lat:57 SizeLat:47 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
; GLM-NEXT: Cost Model: Found costs of RThru:96 CodeSize:61 Lat:113 SizeLat:93 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'splatvar_funnel_i8'
-; XOP-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; XOP-NEXT: Cost Model: Found costs of RThru:9 CodeSize:7 Lat:12 SizeLat:8 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; XOP-NEXT: Cost Model: Found costs of RThru:23 CodeSize:25 Lat:23 SizeLat:31 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
; XOP-NEXT: Cost Model: Found costs of RThru:46 CodeSize:50 Lat:46 SizeLat:62 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512GFNI-LABEL: 'splatvar_funnel_i8'
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:11 CodeSize:15 Lat:20 SizeLat:20 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:13 CodeSize:18 Lat:20 SizeLat:22 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:13 CodeSize:18 Lat:20 SizeLat:23 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
@@ -1934,8 +1934,8 @@ define void @splatvar_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; AVX1-LABEL: 'splatvar_rotate_i64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:9 SizeLat:7 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
; AVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:18 SizeLat:22 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
; AVX1-NEXT: Cost Model: Found costs of RThru:24 CodeSize:30 Lat:36 SizeLat:44 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
@@ -1943,8 +1943,8 @@ define void @splatvar_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; AVX2-LABEL: 'splatvar_rotate_i64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:7 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:11 SizeLat:10 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:10 Lat:22 SizeLat:20 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
@@ -1952,8 +1952,8 @@ define void @splatvar_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; AVX512-LABEL: 'splatvar_rotate_i64'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
@@ -1979,8 +1979,8 @@ define void @splatvar_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; XOP-LABEL: 'splatvar_rotate_i64'
; XOP-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 2 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 2 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
; XOP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
; XOP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:7 SizeLat:6 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
; XOP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:14 SizeLat:12 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
@@ -2007,8 +2007,8 @@ define void @splatvar_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; AVX1-LABEL: 'splatvar_rotate_i32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:9 SizeLat:7 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
; AVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:17 Lat:18 SizeLat:24 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
; AVX1-NEXT: Cost Model: Found costs of RThru:24 CodeSize:34 Lat:36 SizeLat:48 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
@@ -2016,8 +2016,8 @@ define void @splatvar_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; AVX2-LABEL: 'splatvar_rotate_i32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:7 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:12 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:14 Lat:22 SizeLat:24 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
@@ -2025,8 +2025,8 @@ define void @splatvar_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; AVX512-LABEL: 'splatvar_rotate_i32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
@@ -2052,8 +2052,8 @@ define void @splatvar_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; XOP-LABEL: 'splatvar_rotate_i32'
; XOP-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 2 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 2 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
; XOP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
; XOP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:7 SizeLat:6 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
; XOP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:14 SizeLat:12 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
@@ -2070,99 +2070,99 @@ define void @splatvar_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
define void @splatvar_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %c16, <8 x i16> %c128, <16 x i16> %c256, <32 x i16> %c512) {
; SSE-LABEL: 'splatvar_rotate_i16'
-; SSE-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; SSE-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; SSE-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; SSE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:7 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; SSE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:9 Lat:13 SizeLat:13 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
; SSE-NEXT: Cost Model: Found costs of RThru:28 CodeSize:17 Lat:25 SizeLat:25 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'splatvar_rotate_i16'
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 3 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 3 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:9 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; AVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:17 Lat:18 SizeLat:24 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
; AVX1-NEXT: Cost Model: Found costs of RThru:24 CodeSize:34 Lat:36 SizeLat:48 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'splatvar_rotate_i16'
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:12 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:14 Lat:22 SizeLat:24 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'splatvar_rotate_i16'
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; AVX512F-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:10 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
; AVX512F-NEXT: Cost Model: Found costs of RThru:15 CodeSize:17 Lat:29 SizeLat:21 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'splatvar_rotate_i16'
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:2 CodeSize:6 Lat:7 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:2 CodeSize:6 Lat:8 SizeLat:7 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:2 CodeSize:6 Lat:8 SizeLat:8 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512DQ-LABEL: 'splatvar_rotate_i16'
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:10 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:15 CodeSize:17 Lat:29 SizeLat:21 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'splatvar_rotate_i16'
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SLM-LABEL: 'splatvar_rotate_i16'
-; SLM-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; SLM-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; SLM-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; SLM-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:7 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; SLM-NEXT: Cost Model: Found costs of RThru:14 CodeSize:9 Lat:13 SizeLat:13 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
; SLM-NEXT: Cost Model: Found costs of RThru:28 CodeSize:17 Lat:25 SizeLat:25 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'splatvar_rotate_i16'
-; GLM-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; GLM-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; GLM-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; GLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; GLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; GLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; GLM-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:7 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; GLM-NEXT: Cost Model: Found costs of RThru:14 CodeSize:9 Lat:13 SizeLat:13 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
; GLM-NEXT: Cost Model: Found costs of RThru:28 CodeSize:17 Lat:25 SizeLat:25 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'splatvar_rotate_i16'
-; XOP-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 3 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 3 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; XOP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; XOP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:7 SizeLat:6 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
; XOP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:14 SizeLat:12 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512GFNI-LABEL: 'splatvar_rotate_i16'
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
@@ -2179,99 +2179,99 @@ define void @splatvar_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
define void @splatvar_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %c8, <16 x i8> %c128, <32 x i8> %c256, <64 x i8> %c512) {
; SSE-LABEL: 'splatvar_rotate_i8'
-; SSE-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; SSE-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; SSE-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; SSE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:14 Lat:26 SizeLat:21 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; SSE-NEXT: Cost Model: Found costs of RThru:42 CodeSize:27 Lat:51 SizeLat:41 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
; SSE-NEXT: Cost Model: Found costs of RThru:84 CodeSize:53 Lat:101 SizeLat:81 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'splatvar_rotate_i8'
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:11 CodeSize:14 Lat:15 SizeLat:19 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; AVX1-NEXT: Cost Model: Found costs of RThru:20 CodeSize:28 Lat:21 SizeLat:38 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
; AVX1-NEXT: Cost Model: Found costs of RThru:40 CodeSize:56 Lat:42 SizeLat:76 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'splatvar_rotate_i8'
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:13 Lat:17 SizeLat:18 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; AVX2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:16 Lat:18 SizeLat:23 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
; AVX2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:32 Lat:36 SizeLat:46 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'splatvar_rotate_i8'
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:9 CodeSize:13 Lat:17 SizeLat:18 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; AVX512F-NEXT: Cost Model: Found costs of RThru:11 CodeSize:16 Lat:18 SizeLat:21 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
; AVX512F-NEXT: Cost Model: Found costs of RThru:35 CodeSize:64 Lat:47 SizeLat:76 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'splatvar_rotate_i8'
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:15 SizeLat:10 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:15 SizeLat:10 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:5 CodeSize:11 Lat:6 SizeLat:12 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512DQ-LABEL: 'splatvar_rotate_i8'
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:9 CodeSize:13 Lat:17 SizeLat:18 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:11 CodeSize:16 Lat:18 SizeLat:21 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:35 CodeSize:64 Lat:47 SizeLat:76 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'splatvar_rotate_i8'
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:15 SizeLat:10 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:15 SizeLat:10 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:11 Lat:6 SizeLat:12 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SLM-LABEL: 'splatvar_rotate_i8'
-; SLM-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; SLM-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; SLM-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; SLM-NEXT: Cost Model: Found costs of RThru:21 CodeSize:14 Lat:26 SizeLat:21 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; SLM-NEXT: Cost Model: Found costs of RThru:42 CodeSize:27 Lat:51 SizeLat:41 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
; SLM-NEXT: Cost Model: Found costs of RThru:84 CodeSize:53 Lat:101 SizeLat:81 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'splatvar_rotate_i8'
-; GLM-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; GLM-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; GLM-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; GLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; GLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; GLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; GLM-NEXT: Cost Model: Found costs of RThru:21 CodeSize:14 Lat:26 SizeLat:21 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; GLM-NEXT: Cost Model: Found costs of RThru:42 CodeSize:27 Lat:51 SizeLat:41 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
; GLM-NEXT: Cost Model: Found costs of RThru:84 CodeSize:53 Lat:101 SizeLat:81 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'splatvar_rotate_i8'
-; XOP-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; XOP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; XOP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:7 SizeLat:6 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
; XOP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:14 SizeLat:12 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512GFNI-LABEL: 'splatvar_rotate_i8'
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:15 SizeLat:10 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:5 CodeSize:7 Lat:15 SizeLat:10 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:5 CodeSize:11 Lat:6 SizeLat:12 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
diff --git a/llvm/test/Analysis/CostModel/X86/fshr.ll b/llvm/test/Analysis/CostModel/X86/fshr.ll
index af9a91b..1990605 100644
--- a/llvm/test/Analysis/CostModel/X86/fshr.ll
+++ b/llvm/test/Analysis/CostModel/X86/fshr.ll
@@ -414,8 +414,8 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; AVX1-LABEL: 'splatvar_funnel_i64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:13 SizeLat:11 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
; AVX1-NEXT: Cost Model: Found costs of RThru:19 CodeSize:21 Lat:23 SizeLat:30 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
; AVX1-NEXT: Cost Model: Found costs of RThru:38 CodeSize:42 Lat:46 SizeLat:60 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
@@ -423,8 +423,8 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; AVX2-LABEL: 'splatvar_funnel_i64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:11 SizeLat:11 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:14 SizeLat:14 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
; AVX2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:14 Lat:28 SizeLat:28 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
@@ -432,8 +432,8 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; AVX512F-LABEL: 'splatvar_funnel_i64'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
; AVX512F-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:13 SizeLat:10 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
; AVX512F-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:13 SizeLat:9 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
@@ -441,8 +441,8 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; AVX512BW-LABEL: 'splatvar_funnel_i64'
; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:13 SizeLat:9 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:13 SizeLat:9 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
@@ -450,8 +450,8 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; AVX512DQ-LABEL: 'splatvar_funnel_i64'
; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:13 SizeLat:10 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:13 SizeLat:9 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
@@ -459,8 +459,8 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; AVX512VBMI2-LABEL: 'splatvar_funnel_i64'
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
@@ -486,8 +486,8 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; XOP-LABEL: 'splatvar_funnel_i64'
; XOP-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 2 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 2 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
; XOP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:12 SizeLat:10 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
; XOP-NEXT: Cost Model: Found costs of RThru:19 CodeSize:21 Lat:23 SizeLat:30 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
; XOP-NEXT: Cost Model: Found costs of RThru:38 CodeSize:42 Lat:46 SizeLat:60 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
@@ -495,8 +495,8 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; AVX512GFNI-LABEL: 'splatvar_funnel_i64'
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
@@ -532,8 +532,8 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; AVX1-LABEL: 'splatvar_funnel_i32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:12 SizeLat:10 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
; AVX1-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:32 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
; AVX1-NEXT: Cost Model: Found costs of RThru:38 CodeSize:46 Lat:46 SizeLat:64 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
@@ -541,8 +541,8 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; AVX2-LABEL: 'splatvar_funnel_i32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:10 SizeLat:10 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:9 Lat:14 SizeLat:16 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
; AVX2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:18 Lat:28 SizeLat:32 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
@@ -550,8 +550,8 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; AVX512F-LABEL: 'splatvar_funnel_i32'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:9 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
; AVX512F-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:12 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
; AVX512F-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:11 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
@@ -559,8 +559,8 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; AVX512BW-LABEL: 'splatvar_funnel_i32'
; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:9 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:11 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:11 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
@@ -568,8 +568,8 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; AVX512DQ-LABEL: 'splatvar_funnel_i32'
; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:9 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:12 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:11 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
@@ -577,8 +577,8 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; AVX512VBMI2-LABEL: 'splatvar_funnel_i32'
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
@@ -604,8 +604,8 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; XOP-LABEL: 'splatvar_funnel_i32'
; XOP-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 2 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 2 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
; XOP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:12 SizeLat:10 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
; XOP-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:32 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
; XOP-NEXT: Cost Model: Found costs of RThru:38 CodeSize:46 Lat:46 SizeLat:64 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
@@ -613,8 +613,8 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; AVX512GFNI-LABEL: 'splatvar_funnel_i32'
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
@@ -631,108 +631,108 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
define void @splatvar_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %b16, <8 x i16> %b128, <16 x i16> %b256, <32 x i16> %b512, i16 %c16, <8 x i16> %c128, <16 x i16> %c256, <32 x i16> %c512) {
; SSSE3-LABEL: 'splatvar_funnel_i16'
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; SSSE3-NEXT: Cost Model: Found costs of RThru:10 CodeSize:9 Lat:10 SizeLat:11 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; SSSE3-NEXT: Cost Model: Found costs of RThru:20 CodeSize:17 Lat:19 SizeLat:21 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
; SSSE3-NEXT: Cost Model: Found costs of RThru:40 CodeSize:33 Lat:37 SizeLat:41 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'splatvar_funnel_i16'
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:10 SizeLat:10 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; SSE42-NEXT: Cost Model: Found costs of RThru:20 CodeSize:13 Lat:19 SizeLat:19 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
; SSE42-NEXT: Cost Model: Found costs of RThru:40 CodeSize:25 Lat:37 SizeLat:37 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'splatvar_funnel_i16'
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 3 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 3 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:12 SizeLat:10 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; AVX1-NEXT: Cost Model: Found costs of RThru:19 CodeSize:25 Lat:23 SizeLat:33 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
; AVX1-NEXT: Cost Model: Found costs of RThru:38 CodeSize:50 Lat:46 SizeLat:66 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'splatvar_funnel_i16'
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:10 SizeLat:10 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:9 Lat:14 SizeLat:16 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
; AVX2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:18 Lat:28 SizeLat:32 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'splatvar_funnel_i16'
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:9 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; AVX512F-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:13 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
; AVX512F-NEXT: Cost Model: Found costs of RThru:20 CodeSize:26 Lat:38 SizeLat:30 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'splatvar_funnel_i16'
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:9 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:11 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:11 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512DQ-LABEL: 'splatvar_funnel_i16'
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:9 SizeLat:9 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:13 SizeLat:13 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:20 CodeSize:26 Lat:38 SizeLat:30 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'splatvar_funnel_i16'
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SLM-LABEL: 'splatvar_funnel_i16'
-; SLM-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; SLM-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; SLM-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; SLM-NEXT: Cost Model: Found costs of RThru:12 CodeSize:7 Lat:12 SizeLat:11 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; SLM-NEXT: Cost Model: Found costs of RThru:24 CodeSize:13 Lat:23 SizeLat:21 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
; SLM-NEXT: Cost Model: Found costs of RThru:48 CodeSize:25 Lat:45 SizeLat:41 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'splatvar_funnel_i16'
-; GLM-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; GLM-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; GLM-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; GLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; GLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; GLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; GLM-NEXT: Cost Model: Found costs of RThru:10 CodeSize:7 Lat:10 SizeLat:10 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; GLM-NEXT: Cost Model: Found costs of RThru:20 CodeSize:13 Lat:19 SizeLat:19 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
; GLM-NEXT: Cost Model: Found costs of RThru:40 CodeSize:25 Lat:37 SizeLat:37 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'splatvar_funnel_i16'
-; XOP-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 3 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 3 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; XOP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:12 SizeLat:10 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; XOP-NEXT: Cost Model: Found costs of RThru:19 CodeSize:25 Lat:23 SizeLat:33 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
; XOP-NEXT: Cost Model: Found costs of RThru:38 CodeSize:50 Lat:46 SizeLat:66 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512GFNI-LABEL: 'splatvar_funnel_i16'
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
@@ -749,108 +749,108 @@ define void @splatvar_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
define void @splatvar_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %b8, <16 x i8> %b128, <32 x i8> %b256, <64 x i8> %b512, i8 %c8, <16 x i8> %c128, <32 x i8> %c256, <64 x i8> %c512) {
; SSSE3-LABEL: 'splatvar_funnel_i8'
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; SSSE3-NEXT: Cost Model: Found costs of RThru:24 CodeSize:18 Lat:29 SizeLat:25 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; SSSE3-NEXT: Cost Model: Found costs of RThru:48 CodeSize:35 Lat:57 SizeLat:49 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
; SSSE3-NEXT: Cost Model: Found costs of RThru:96 CodeSize:69 Lat:113 SizeLat:97 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'splatvar_funnel_i8'
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:16 Lat:29 SizeLat:24 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; SSE42-NEXT: Cost Model: Found costs of RThru:48 CodeSize:31 Lat:57 SizeLat:47 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
; SSE42-NEXT: Cost Model: Found costs of RThru:96 CodeSize:61 Lat:113 SizeLat:93 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'splatvar_funnel_i8'
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:16 Lat:18 SizeLat:22 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; AVX1-NEXT: Cost Model: Found costs of RThru:27 CodeSize:36 Lat:26 SizeLat:47 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
; AVX1-NEXT: Cost Model: Found costs of RThru:54 CodeSize:72 Lat:52 SizeLat:94 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'splatvar_funnel_i8'
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:20 SizeLat:21 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:21 SizeLat:27 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
; AVX2-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:42 SizeLat:54 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'splatvar_funnel_i8'
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:11 CodeSize:15 Lat:19 SizeLat:20 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; AVX512F-NEXT: Cost Model: Found costs of RThru:13 CodeSize:18 Lat:20 SizeLat:24 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
; AVX512F-NEXT: Cost Model: Found costs of RThru:40 CodeSize:73 Lat:56 SizeLat:85 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'splatvar_funnel_i8'
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:11 CodeSize:15 Lat:20 SizeLat:20 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:13 CodeSize:18 Lat:20 SizeLat:22 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:13 CodeSize:18 Lat:20 SizeLat:23 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512DQ-LABEL: 'splatvar_funnel_i8'
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:11 CodeSize:15 Lat:19 SizeLat:20 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:13 CodeSize:18 Lat:20 SizeLat:24 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:40 CodeSize:73 Lat:56 SizeLat:85 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'splatvar_funnel_i8'
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:15 Lat:20 SizeLat:20 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:18 Lat:20 SizeLat:22 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:18 Lat:20 SizeLat:23 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SLM-LABEL: 'splatvar_funnel_i8'
-; SLM-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; SLM-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; SLM-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; SLM-NEXT: Cost Model: Found costs of RThru:26 CodeSize:16 Lat:31 SizeLat:25 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; SLM-NEXT: Cost Model: Found costs of RThru:52 CodeSize:31 Lat:61 SizeLat:49 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
; SLM-NEXT: Cost Model: Found costs of RThru:104 CodeSize:61 Lat:121 SizeLat:97 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'splatvar_funnel_i8'
-; GLM-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; GLM-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; GLM-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; GLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; GLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; GLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; GLM-NEXT: Cost Model: Found costs of RThru:24 CodeSize:16 Lat:29 SizeLat:24 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; GLM-NEXT: Cost Model: Found costs of RThru:48 CodeSize:31 Lat:57 SizeLat:47 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
; GLM-NEXT: Cost Model: Found costs of RThru:96 CodeSize:61 Lat:113 SizeLat:93 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'splatvar_funnel_i8'
-; XOP-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; XOP-NEXT: Cost Model: Found costs of RThru:9 CodeSize:7 Lat:12 SizeLat:8 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; XOP-NEXT: Cost Model: Found costs of RThru:23 CodeSize:25 Lat:23 SizeLat:31 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
; XOP-NEXT: Cost Model: Found costs of RThru:46 CodeSize:50 Lat:46 SizeLat:62 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512GFNI-LABEL: 'splatvar_funnel_i8'
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:11 CodeSize:15 Lat:20 SizeLat:20 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:13 CodeSize:18 Lat:20 SizeLat:22 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:13 CodeSize:18 Lat:20 SizeLat:23 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
@@ -1934,8 +1934,8 @@ define void @splatvar_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; AVX1-LABEL: 'splatvar_rotate_i64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:9 SizeLat:7 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
; AVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:18 SizeLat:22 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
; AVX1-NEXT: Cost Model: Found costs of RThru:24 CodeSize:30 Lat:36 SizeLat:44 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
@@ -1943,8 +1943,8 @@ define void @splatvar_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; AVX2-LABEL: 'splatvar_rotate_i64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:7 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:11 SizeLat:10 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:10 Lat:22 SizeLat:20 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
@@ -1952,8 +1952,8 @@ define void @splatvar_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; AVX512-LABEL: 'splatvar_rotate_i64'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
@@ -1979,8 +1979,8 @@ define void @splatvar_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; XOP-LABEL: 'splatvar_rotate_i64'
; XOP-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 2 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 2 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
; XOP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:3 SizeLat:3 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
; XOP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:8 Lat:7 SizeLat:9 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
; XOP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:16 Lat:14 SizeLat:18 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
@@ -2007,8 +2007,8 @@ define void @splatvar_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; AVX1-LABEL: 'splatvar_rotate_i32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:9 SizeLat:7 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
; AVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:17 Lat:18 SizeLat:24 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
; AVX1-NEXT: Cost Model: Found costs of RThru:24 CodeSize:34 Lat:36 SizeLat:48 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
@@ -2016,8 +2016,8 @@ define void @splatvar_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; AVX2-LABEL: 'splatvar_rotate_i32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:7 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:12 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:14 Lat:22 SizeLat:24 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
@@ -2025,8 +2025,8 @@ define void @splatvar_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; AVX512-LABEL: 'splatvar_rotate_i32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
@@ -2052,8 +2052,8 @@ define void @splatvar_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; XOP-LABEL: 'splatvar_rotate_i32'
; XOP-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 2 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 2 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
; XOP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:3 SizeLat:3 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
; XOP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:8 Lat:7 SizeLat:9 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
; XOP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:16 Lat:14 SizeLat:18 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
@@ -2070,99 +2070,99 @@ define void @splatvar_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
define void @splatvar_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %c16, <8 x i16> %c128, <16 x i16> %c256, <32 x i16> %c512) {
; SSE-LABEL: 'splatvar_rotate_i16'
-; SSE-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; SSE-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; SSE-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; SSE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:7 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; SSE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:9 Lat:13 SizeLat:13 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
; SSE-NEXT: Cost Model: Found costs of RThru:28 CodeSize:17 Lat:25 SizeLat:25 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'splatvar_rotate_i16'
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 3 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 3 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:9 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; AVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:17 Lat:18 SizeLat:24 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
; AVX1-NEXT: Cost Model: Found costs of RThru:24 CodeSize:34 Lat:36 SizeLat:48 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'splatvar_rotate_i16'
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:12 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:14 Lat:22 SizeLat:24 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'splatvar_rotate_i16'
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; AVX512F-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:10 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
; AVX512F-NEXT: Cost Model: Found costs of RThru:15 CodeSize:17 Lat:29 SizeLat:21 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'splatvar_rotate_i16'
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:2 CodeSize:6 Lat:7 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:2 CodeSize:6 Lat:8 SizeLat:7 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:2 CodeSize:6 Lat:8 SizeLat:8 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512DQ-LABEL: 'splatvar_rotate_i16'
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:7 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:11 SizeLat:10 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:15 CodeSize:17 Lat:29 SizeLat:21 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'splatvar_rotate_i16'
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SLM-LABEL: 'splatvar_rotate_i16'
-; SLM-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; SLM-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; SLM-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; SLM-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:7 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; SLM-NEXT: Cost Model: Found costs of RThru:14 CodeSize:9 Lat:13 SizeLat:13 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
; SLM-NEXT: Cost Model: Found costs of RThru:28 CodeSize:17 Lat:25 SizeLat:25 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'splatvar_rotate_i16'
-; GLM-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; GLM-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; GLM-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; GLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; GLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; GLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; GLM-NEXT: Cost Model: Found costs of RThru:7 CodeSize:5 Lat:7 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; GLM-NEXT: Cost Model: Found costs of RThru:14 CodeSize:9 Lat:13 SizeLat:13 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
; GLM-NEXT: Cost Model: Found costs of RThru:28 CodeSize:17 Lat:25 SizeLat:25 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'splatvar_rotate_i16'
-; XOP-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 3 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 3 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; XOP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:3 SizeLat:3 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; XOP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:8 Lat:7 SizeLat:9 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
; XOP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:16 Lat:14 SizeLat:18 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512GFNI-LABEL: 'splatvar_rotate_i16'
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
@@ -2179,99 +2179,99 @@ define void @splatvar_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
define void @splatvar_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %c8, <16 x i8> %c128, <32 x i8> %c256, <64 x i8> %c512) {
; SSE-LABEL: 'splatvar_rotate_i8'
-; SSE-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; SSE-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; SSE-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; SSE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:14 Lat:26 SizeLat:21 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; SSE-NEXT: Cost Model: Found costs of RThru:42 CodeSize:27 Lat:51 SizeLat:41 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
; SSE-NEXT: Cost Model: Found costs of RThru:84 CodeSize:53 Lat:101 SizeLat:81 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'splatvar_rotate_i8'
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:11 CodeSize:14 Lat:15 SizeLat:19 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; AVX1-NEXT: Cost Model: Found costs of RThru:20 CodeSize:28 Lat:21 SizeLat:38 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
; AVX1-NEXT: Cost Model: Found costs of RThru:40 CodeSize:56 Lat:42 SizeLat:76 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'splatvar_rotate_i8'
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:13 Lat:17 SizeLat:18 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; AVX2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:16 Lat:18 SizeLat:23 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
; AVX2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:32 Lat:36 SizeLat:46 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'splatvar_rotate_i8'
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:9 CodeSize:13 Lat:17 SizeLat:18 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; AVX512F-NEXT: Cost Model: Found costs of RThru:11 CodeSize:16 Lat:18 SizeLat:21 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
; AVX512F-NEXT: Cost Model: Found costs of RThru:35 CodeSize:64 Lat:47 SizeLat:76 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'splatvar_rotate_i8'
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:5 CodeSize:6 Lat:14 SizeLat:9 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:5 CodeSize:6 Lat:14 SizeLat:9 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:5 CodeSize:12 Lat:6 SizeLat:14 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512DQ-LABEL: 'splatvar_rotate_i8'
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:9 CodeSize:13 Lat:17 SizeLat:18 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:11 CodeSize:16 Lat:18 SizeLat:21 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:35 CodeSize:64 Lat:47 SizeLat:76 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'splatvar_rotate_i8'
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX512VBMI2-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:6 Lat:14 SizeLat:9 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:6 Lat:14 SizeLat:9 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:12 Lat:6 SizeLat:14 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SLM-LABEL: 'splatvar_rotate_i8'
-; SLM-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; SLM-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; SLM-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; SLM-NEXT: Cost Model: Found costs of RThru:21 CodeSize:14 Lat:26 SizeLat:21 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; SLM-NEXT: Cost Model: Found costs of RThru:42 CodeSize:27 Lat:51 SizeLat:41 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
; SLM-NEXT: Cost Model: Found costs of RThru:84 CodeSize:53 Lat:101 SizeLat:81 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'splatvar_rotate_i8'
-; GLM-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; GLM-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; GLM-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; GLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; GLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; GLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; GLM-NEXT: Cost Model: Found costs of RThru:21 CodeSize:14 Lat:26 SizeLat:21 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; GLM-NEXT: Cost Model: Found costs of RThru:42 CodeSize:27 Lat:51 SizeLat:41 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
; GLM-NEXT: Cost Model: Found costs of RThru:84 CodeSize:53 Lat:101 SizeLat:81 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'splatvar_rotate_i8'
-; XOP-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 2 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; XOP-NEXT: Cost Model: Found costs of 2 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; XOP-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; XOP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:3 Lat:3 SizeLat:3 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; XOP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:8 Lat:7 SizeLat:9 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
; XOP-NEXT: Cost Model: Found costs of RThru:8 CodeSize:16 Lat:14 SizeLat:18 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512GFNI-LABEL: 'splatvar_rotate_i8'
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:5 CodeSize:6 Lat:14 SizeLat:9 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:5 CodeSize:6 Lat:14 SizeLat:9 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:5 CodeSize:12 Lat:6 SizeLat:14 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
diff --git a/llvm/test/Analysis/CostModel/X86/load-broadcast.ll b/llvm/test/Analysis/CostModel/X86/load-broadcast.ll
index 4cd8e5b..ecf54ce 100644
--- a/llvm/test/Analysis/CostModel/X86/load-broadcast.ll
+++ b/llvm/test/Analysis/CostModel/X86/load-broadcast.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse2 | FileCheck %s -check-prefixes=SSE,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s -check-prefixes=SSE,SSSE3
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s -check-prefixes=SSE,SSE42
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx | FileCheck %s -check-prefixes=AVX,AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx2 | FileCheck %s -check-prefixes=AVX,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVX512
-;
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=SSE,SSE42
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,SSE42
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=btver2 | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mattr=+sse2 | FileCheck %s -check-prefixes=SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mattr=+ssse3 | FileCheck %s -check-prefixes=SSE,SSSE3
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mattr=+sse4.2 | FileCheck %s -check-prefixes=SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mattr=+avx | FileCheck %s -check-prefixes=AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mattr=+avx2 | FileCheck %s -check-prefixes=AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVX512
+;
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mcpu=slm | FileCheck %s --check-prefixes=SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all -mcpu=btver2 | FileCheck %s --check-prefixes=AVX,AVX1
;
; vXf64
@@ -18,24 +18,24 @@
define <2 x double> @broadcast_load_v2f64_v2f64(ptr %src) {
; SSE2-LABEL: 'broadcast_load_v2f64_v2f64'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <2 x double>, ptr %src, align 16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <2 x double> %load, <2 x double> poison, <2 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %bcst
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <2 x double>, ptr %src, align 16
+; SSE2-NEXT: Cost Model: Found costs of 1 for: %bcst = shufflevector <2 x double> %load, <2 x double> poison, <2 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %bcst
;
; SSSE3-LABEL: 'broadcast_load_v2f64_v2f64'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <2 x double>, ptr %src, align 16
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <2 x double> %load, <2 x double> poison, <2 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %bcst
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <2 x double>, ptr %src, align 16
+; SSSE3-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <2 x double> %load, <2 x double> poison, <2 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %bcst
;
; SSE42-LABEL: 'broadcast_load_v2f64_v2f64'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <2 x double>, ptr %src, align 16
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <2 x double> %load, <2 x double> poison, <2 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %bcst
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <2 x double>, ptr %src, align 16
+; SSE42-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <2 x double> %load, <2 x double> poison, <2 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %bcst
;
; AVX-LABEL: 'broadcast_load_v2f64_v2f64'
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <2 x double>, ptr %src, align 16
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <2 x double> %load, <2 x double> poison, <2 x i32> zeroinitializer
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %bcst
+; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <2 x double>, ptr %src, align 16
+; AVX-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <2 x double> %load, <2 x double> poison, <2 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %bcst
;
%load = load <2 x double>, ptr %src
%bcst = shufflevector <2 x double> %load, <2 x double> poison, <2 x i32> zeroinitializer
@@ -44,24 +44,24 @@ define <2 x double> @broadcast_load_v2f64_v2f64(ptr %src) {
define <4 x double> @broadcast_load_v2f64_v4f64(ptr %src) {
; SSE2-LABEL: 'broadcast_load_v2f64_v4f64'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <2 x double>, ptr %src, align 16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <2 x double> %load, <2 x double> poison, <4 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x double> %bcst
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <2 x double>, ptr %src, align 16
+; SSE2-NEXT: Cost Model: Found costs of 1 for: %bcst = shufflevector <2 x double> %load, <2 x double> poison, <4 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x double> %bcst
;
; SSSE3-LABEL: 'broadcast_load_v2f64_v4f64'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <2 x double>, ptr %src, align 16
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <2 x double> %load, <2 x double> poison, <4 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x double> %bcst
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <2 x double>, ptr %src, align 16
+; SSSE3-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <2 x double> %load, <2 x double> poison, <4 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x double> %bcst
;
; SSE42-LABEL: 'broadcast_load_v2f64_v4f64'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <2 x double>, ptr %src, align 16
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <2 x double> %load, <2 x double> poison, <4 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x double> %bcst
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <2 x double>, ptr %src, align 16
+; SSE42-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <2 x double> %load, <2 x double> poison, <4 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x double> %bcst
;
; AVX-LABEL: 'broadcast_load_v2f64_v4f64'
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <2 x double>, ptr %src, align 16
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <2 x double> %load, <2 x double> poison, <4 x i32> zeroinitializer
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x double> %bcst
+; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <2 x double>, ptr %src, align 16
+; AVX-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <2 x double> %load, <2 x double> poison, <4 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x double> %bcst
;
%load = load <2 x double>, ptr %src
%bcst = shufflevector <2 x double> %load, <2 x double> poison, <4 x i32> zeroinitializer
@@ -70,24 +70,24 @@ define <4 x double> @broadcast_load_v2f64_v4f64(ptr %src) {
define <8 x double> @broadcast_load_v2f64_v8f64(ptr %src) {
; SSE2-LABEL: 'broadcast_load_v2f64_v8f64'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <2 x double>, ptr %src, align 16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <2 x double> %load, <2 x double> poison, <8 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x double> %bcst
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <2 x double>, ptr %src, align 16
+; SSE2-NEXT: Cost Model: Found costs of 1 for: %bcst = shufflevector <2 x double> %load, <2 x double> poison, <8 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x double> %bcst
;
; SSSE3-LABEL: 'broadcast_load_v2f64_v8f64'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <2 x double>, ptr %src, align 16
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <2 x double> %load, <2 x double> poison, <8 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x double> %bcst
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <2 x double>, ptr %src, align 16
+; SSSE3-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <2 x double> %load, <2 x double> poison, <8 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x double> %bcst
;
; SSE42-LABEL: 'broadcast_load_v2f64_v8f64'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <2 x double>, ptr %src, align 16
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <2 x double> %load, <2 x double> poison, <8 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x double> %bcst
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <2 x double>, ptr %src, align 16
+; SSE42-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <2 x double> %load, <2 x double> poison, <8 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x double> %bcst
;
; AVX-LABEL: 'broadcast_load_v2f64_v8f64'
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <2 x double>, ptr %src, align 16
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <2 x double> %load, <2 x double> poison, <8 x i32> zeroinitializer
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x double> %bcst
+; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <2 x double>, ptr %src, align 16
+; AVX-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <2 x double> %load, <2 x double> poison, <8 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x double> %bcst
;
%load = load <2 x double>, ptr %src
%bcst = shufflevector <2 x double> %load, <2 x double> poison, <8 x i32> zeroinitializer
@@ -96,16 +96,16 @@ define <8 x double> @broadcast_load_v2f64_v8f64(ptr %src) {
define <2 x double> @broadcast_load_f64_v2f64(ptr %src) {
; SSE-LABEL: 'broadcast_load_f64_v2f64'
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load double, ptr %src, align 8
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %insert = insertelement <2 x double> poison, double %load, i32 0
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <2 x double> %insert, <2 x double> poison, <2 x i32> zeroinitializer
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %bcst
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load double, ptr %src, align 8
+; SSE-NEXT: Cost Model: Found costs of 0 for: %insert = insertelement <2 x double> poison, double %load, i32 0
+; SSE-NEXT: Cost Model: Found costs of 1 for: %bcst = shufflevector <2 x double> %insert, <2 x double> poison, <2 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %bcst
;
; AVX-LABEL: 'broadcast_load_f64_v2f64'
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load double, ptr %src, align 8
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %insert = insertelement <2 x double> poison, double %load, i32 0
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <2 x double> %insert, <2 x double> poison, <2 x i32> zeroinitializer
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x double> %bcst
+; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load double, ptr %src, align 8
+; AVX-NEXT: Cost Model: Found costs of 0 for: %insert = insertelement <2 x double> poison, double %load, i32 0
+; AVX-NEXT: Cost Model: Found costs of 1 for: %bcst = shufflevector <2 x double> %insert, <2 x double> poison, <2 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %bcst
;
%load = load double, ptr %src
%insert = insertelement <2 x double> poison, double %load, i32 0
@@ -115,28 +115,28 @@ define <2 x double> @broadcast_load_f64_v2f64(ptr %src) {
define <4 x double> @broadcast_load_f64_v4f64(ptr %src) {
; SSE-LABEL: 'broadcast_load_f64_v4f64'
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load double, ptr %src, align 8
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %insert = insertelement <4 x double> poison, double %load, i32 0
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <4 x double> %insert, <4 x double> poison, <4 x i32> zeroinitializer
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x double> %bcst
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load double, ptr %src, align 8
+; SSE-NEXT: Cost Model: Found costs of 0 for: %insert = insertelement <4 x double> poison, double %load, i32 0
+; SSE-NEXT: Cost Model: Found costs of 1 for: %bcst = shufflevector <4 x double> %insert, <4 x double> poison, <4 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x double> %bcst
;
; AVX1-LABEL: 'broadcast_load_f64_v4f64'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load double, ptr %src, align 8
-; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %insert = insertelement <4 x double> poison, double %load, i32 0
-; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bcst = shufflevector <4 x double> %insert, <4 x double> poison, <4 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x double> %bcst
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load double, ptr %src, align 8
+; AVX1-NEXT: Cost Model: Found costs of 0 for: %insert = insertelement <4 x double> poison, double %load, i32 0
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %bcst = shufflevector <4 x double> %insert, <4 x double> poison, <4 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x double> %bcst
;
; AVX2-LABEL: 'broadcast_load_f64_v4f64'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load double, ptr %src, align 8
-; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %insert = insertelement <4 x double> poison, double %load, i32 0
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <4 x double> %insert, <4 x double> poison, <4 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x double> %bcst
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load double, ptr %src, align 8
+; AVX2-NEXT: Cost Model: Found costs of 0 for: %insert = insertelement <4 x double> poison, double %load, i32 0
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %bcst = shufflevector <4 x double> %insert, <4 x double> poison, <4 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x double> %bcst
;
; AVX512-LABEL: 'broadcast_load_f64_v4f64'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load double, ptr %src, align 8
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %insert = insertelement <4 x double> poison, double %load, i32 0
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <4 x double> %insert, <4 x double> poison, <4 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x double> %bcst
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load double, ptr %src, align 8
+; AVX512-NEXT: Cost Model: Found costs of 0 for: %insert = insertelement <4 x double> poison, double %load, i32 0
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bcst = shufflevector <4 x double> %insert, <4 x double> poison, <4 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x double> %bcst
;
%load = load double, ptr %src
%insert = insertelement <4 x double> poison, double %load, i32 0
@@ -146,28 +146,28 @@ define <4 x double> @broadcast_load_f64_v4f64(ptr %src) {
define <8 x double> @broadcast_load_f64_v8f64(ptr %src) {
; SSE-LABEL: 'broadcast_load_f64_v8f64'
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load double, ptr %src, align 8
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %insert = insertelement <8 x double> poison, double %load, i32 0
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <8 x double> %insert, <8 x double> poison, <8 x i32> zeroinitializer
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x double> %bcst
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load double, ptr %src, align 8
+; SSE-NEXT: Cost Model: Found costs of 0 for: %insert = insertelement <8 x double> poison, double %load, i32 0
+; SSE-NEXT: Cost Model: Found costs of 1 for: %bcst = shufflevector <8 x double> %insert, <8 x double> poison, <8 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x double> %bcst
;
; AVX1-LABEL: 'broadcast_load_f64_v8f64'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load double, ptr %src, align 8
-; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %insert = insertelement <8 x double> poison, double %load, i32 0
-; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bcst = shufflevector <8 x double> %insert, <8 x double> poison, <8 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x double> %bcst
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load double, ptr %src, align 8
+; AVX1-NEXT: Cost Model: Found costs of 0 for: %insert = insertelement <8 x double> poison, double %load, i32 0
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %bcst = shufflevector <8 x double> %insert, <8 x double> poison, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x double> %bcst
;
; AVX2-LABEL: 'broadcast_load_f64_v8f64'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load double, ptr %src, align 8
-; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %insert = insertelement <8 x double> poison, double %load, i32 0
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <8 x double> %insert, <8 x double> poison, <8 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x double> %bcst
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load double, ptr %src, align 8
+; AVX2-NEXT: Cost Model: Found costs of 0 for: %insert = insertelement <8 x double> poison, double %load, i32 0
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %bcst = shufflevector <8 x double> %insert, <8 x double> poison, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x double> %bcst
;
; AVX512-LABEL: 'broadcast_load_f64_v8f64'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load double, ptr %src, align 8
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %insert = insertelement <8 x double> poison, double %load, i32 0
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <8 x double> %insert, <8 x double> poison, <8 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x double> %bcst
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load double, ptr %src, align 8
+; AVX512-NEXT: Cost Model: Found costs of 0 for: %insert = insertelement <8 x double> poison, double %load, i32 0
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %bcst = shufflevector <8 x double> %insert, <8 x double> poison, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x double> %bcst
;
%load = load double, ptr %src
%insert = insertelement <8 x double> poison, double %load, i32 0
@@ -181,14 +181,14 @@ define <8 x double> @broadcast_load_f64_v8f64(ptr %src) {
define <4 x float> @broadcast_load_v4f32_v4f32(ptr %src) {
; SSE-LABEL: 'broadcast_load_v4f32_v4f32'
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <4 x float>, ptr %src, align 16
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <4 x float> %load, <4 x float> poison, <4 x i32> zeroinitializer
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %bcst
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <4 x float>, ptr %src, align 16
+; SSE-NEXT: Cost Model: Found costs of 1 for: %bcst = shufflevector <4 x float> %load, <4 x float> poison, <4 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %bcst
;
; AVX-LABEL: 'broadcast_load_v4f32_v4f32'
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <4 x float>, ptr %src, align 16
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <4 x float> %load, <4 x float> poison, <4 x i32> zeroinitializer
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %bcst
+; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <4 x float>, ptr %src, align 16
+; AVX-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <4 x float> %load, <4 x float> poison, <4 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %bcst
;
%load = load <4 x float>, ptr %src
%bcst = shufflevector <4 x float> %load, <4 x float> poison, <4 x i32> zeroinitializer
@@ -197,14 +197,14 @@ define <4 x float> @broadcast_load_v4f32_v4f32(ptr %src) {
define <8 x float> @broadcast_load_v4f32_v8f32(ptr %src) {
; SSE-LABEL: 'broadcast_load_v4f32_v8f32'
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <4 x float>, ptr %src, align 16
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <4 x float> %load, <4 x float> poison, <8 x i32> zeroinitializer
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %bcst
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <4 x float>, ptr %src, align 16
+; SSE-NEXT: Cost Model: Found costs of 1 for: %bcst = shufflevector <4 x float> %load, <4 x float> poison, <8 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x float> %bcst
;
; AVX-LABEL: 'broadcast_load_v4f32_v8f32'
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <4 x float>, ptr %src, align 16
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <4 x float> %load, <4 x float> poison, <8 x i32> zeroinitializer
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x float> %bcst
+; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <4 x float>, ptr %src, align 16
+; AVX-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <4 x float> %load, <4 x float> poison, <8 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x float> %bcst
;
%load = load <4 x float>, ptr %src
%bcst = shufflevector <4 x float> %load, <4 x float> poison, <8 x i32> zeroinitializer
@@ -213,14 +213,14 @@ define <8 x float> @broadcast_load_v4f32_v8f32(ptr %src) {
define <16 x float> @broadcast_load_v4f32_v16f32(ptr %src) {
; SSE-LABEL: 'broadcast_load_v4f32_v16f32'
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <4 x float>, ptr %src, align 16
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <4 x float> %load, <4 x float> poison, <16 x i32> zeroinitializer
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %bcst
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <4 x float>, ptr %src, align 16
+; SSE-NEXT: Cost Model: Found costs of 1 for: %bcst = shufflevector <4 x float> %load, <4 x float> poison, <16 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %bcst
;
; AVX-LABEL: 'broadcast_load_v4f32_v16f32'
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <4 x float>, ptr %src, align 16
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <4 x float> %load, <4 x float> poison, <16 x i32> zeroinitializer
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x float> %bcst
+; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <4 x float>, ptr %src, align 16
+; AVX-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <4 x float> %load, <4 x float> poison, <16 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %bcst
;
%load = load <4 x float>, ptr %src
%bcst = shufflevector <4 x float> %load, <4 x float> poison, <16 x i32> zeroinitializer
@@ -229,14 +229,14 @@ define <16 x float> @broadcast_load_v4f32_v16f32(ptr %src) {
define <2 x i64> @broadcast_load_v2i64_v2i64(ptr %src) {
; SSE-LABEL: 'broadcast_load_v2i64_v2i64'
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <2 x i64>, ptr %src, align 16
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <2 x i64> %load, <2 x i64> poison, <2 x i32> zeroinitializer
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bcst
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <2 x i64>, ptr %src, align 16
+; SSE-NEXT: Cost Model: Found costs of 1 for: %bcst = shufflevector <2 x i64> %load, <2 x i64> poison, <2 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %bcst
;
; AVX-LABEL: 'broadcast_load_v2i64_v2i64'
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <2 x i64>, ptr %src, align 16
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <2 x i64> %load, <2 x i64> poison, <2 x i32> zeroinitializer
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bcst
+; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <2 x i64>, ptr %src, align 16
+; AVX-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <2 x i64> %load, <2 x i64> poison, <2 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %bcst
;
%load = load <2 x i64>, ptr %src
%bcst = shufflevector <2 x i64> %load, <2 x i64> poison, <2 x i32> zeroinitializer
@@ -249,14 +249,14 @@ define <2 x i64> @broadcast_load_v2i64_v2i64(ptr %src) {
define <4 x i64> @broadcast_load_v2i64_v4i64(ptr %src) {
; SSE-LABEL: 'broadcast_load_v2i64_v4i64'
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <2 x i64>, ptr %src, align 16
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <2 x i64> %load, <2 x i64> poison, <4 x i32> zeroinitializer
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %bcst
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <2 x i64>, ptr %src, align 16
+; SSE-NEXT: Cost Model: Found costs of 1 for: %bcst = shufflevector <2 x i64> %load, <2 x i64> poison, <4 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %bcst
;
; AVX-LABEL: 'broadcast_load_v2i64_v4i64'
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <2 x i64>, ptr %src, align 16
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <2 x i64> %load, <2 x i64> poison, <4 x i32> zeroinitializer
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %bcst
+; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <2 x i64>, ptr %src, align 16
+; AVX-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <2 x i64> %load, <2 x i64> poison, <4 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %bcst
;
%load = load <2 x i64>, ptr %src
%bcst = shufflevector <2 x i64> %load, <2 x i64> poison, <4 x i32> zeroinitializer
@@ -265,14 +265,14 @@ define <4 x i64> @broadcast_load_v2i64_v4i64(ptr %src) {
define <8 x i64> @broadcast_load_v2i64_v8i64(ptr %src) {
; SSE-LABEL: 'broadcast_load_v2i64_v8i64'
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <2 x i64>, ptr %src, align 16
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <2 x i64> %load, <2 x i64> poison, <8 x i32> zeroinitializer
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %bcst
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <2 x i64>, ptr %src, align 16
+; SSE-NEXT: Cost Model: Found costs of 1 for: %bcst = shufflevector <2 x i64> %load, <2 x i64> poison, <8 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %bcst
;
; AVX-LABEL: 'broadcast_load_v2i64_v8i64'
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <2 x i64>, ptr %src, align 16
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <2 x i64> %load, <2 x i64> poison, <8 x i32> zeroinitializer
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %bcst
+; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <2 x i64>, ptr %src, align 16
+; AVX-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <2 x i64> %load, <2 x i64> poison, <8 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %bcst
;
%load = load <2 x i64>, ptr %src
%bcst = shufflevector <2 x i64> %load, <2 x i64> poison, <8 x i32> zeroinitializer
@@ -285,14 +285,14 @@ define <8 x i64> @broadcast_load_v2i64_v8i64(ptr %src) {
define <4 x i32> @broadcast_load_v4i32_v4i32(ptr %src) {
; SSE-LABEL: 'broadcast_load_v4i32_v4i32'
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <4 x i32>, ptr %src, align 16
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <4 x i32> %load, <4 x i32> poison, <4 x i32> zeroinitializer
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bcst
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <4 x i32>, ptr %src, align 16
+; SSE-NEXT: Cost Model: Found costs of 1 for: %bcst = shufflevector <4 x i32> %load, <4 x i32> poison, <4 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %bcst
;
; AVX-LABEL: 'broadcast_load_v4i32_v4i32'
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <4 x i32>, ptr %src, align 16
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <4 x i32> %load, <4 x i32> poison, <4 x i32> zeroinitializer
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bcst
+; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <4 x i32>, ptr %src, align 16
+; AVX-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <4 x i32> %load, <4 x i32> poison, <4 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %bcst
;
%load = load <4 x i32>, ptr %src
%bcst = shufflevector <4 x i32> %load, <4 x i32> poison, <4 x i32> zeroinitializer
@@ -301,14 +301,14 @@ define <4 x i32> @broadcast_load_v4i32_v4i32(ptr %src) {
define <8 x i32> @broadcast_load_v4i32_v8i32(ptr %src) {
; SSE-LABEL: 'broadcast_load_v4i32_v8i32'
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <4 x i32>, ptr %src, align 16
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <4 x i32> %load, <4 x i32> poison, <8 x i32> zeroinitializer
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %bcst
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <4 x i32>, ptr %src, align 16
+; SSE-NEXT: Cost Model: Found costs of 1 for: %bcst = shufflevector <4 x i32> %load, <4 x i32> poison, <8 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %bcst
;
; AVX-LABEL: 'broadcast_load_v4i32_v8i32'
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <4 x i32>, ptr %src, align 16
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <4 x i32> %load, <4 x i32> poison, <8 x i32> zeroinitializer
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %bcst
+; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <4 x i32>, ptr %src, align 16
+; AVX-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <4 x i32> %load, <4 x i32> poison, <8 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %bcst
;
%load = load <4 x i32>, ptr %src
%bcst = shufflevector <4 x i32> %load, <4 x i32> poison, <8 x i32> zeroinitializer
@@ -317,14 +317,14 @@ define <8 x i32> @broadcast_load_v4i32_v8i32(ptr %src) {
define <16 x i32> @broadcast_load_v4i32_v16i32(ptr %src) {
; SSE-LABEL: 'broadcast_load_v4i32_v16i32'
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <4 x i32>, ptr %src, align 16
-; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <4 x i32> %load, <4 x i32> poison, <16 x i32> zeroinitializer
-; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %bcst
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <4 x i32>, ptr %src, align 16
+; SSE-NEXT: Cost Model: Found costs of 1 for: %bcst = shufflevector <4 x i32> %load, <4 x i32> poison, <16 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %bcst
;
; AVX-LABEL: 'broadcast_load_v4i32_v16i32'
-; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <4 x i32>, ptr %src, align 16
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <4 x i32> %load, <4 x i32> poison, <16 x i32> zeroinitializer
-; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %bcst
+; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <4 x i32>, ptr %src, align 16
+; AVX-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <4 x i32> %load, <4 x i32> poison, <16 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %bcst
;
%load = load <4 x i32>, ptr %src
%bcst = shufflevector <4 x i32> %load, <4 x i32> poison, <16 x i32> zeroinitializer
@@ -337,34 +337,34 @@ define <16 x i32> @broadcast_load_v4i32_v16i32(ptr %src) {
define <8 x i16> @broadcast_load_v8i16_v8i16(ptr %src) {
; SSE2-LABEL: 'broadcast_load_v8i16_v8i16'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <8 x i16>, ptr %src, align 16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <8 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bcst
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <8 x i16>, ptr %src, align 16
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <8 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %bcst
;
; SSSE3-LABEL: 'broadcast_load_v8i16_v8i16'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <8 x i16>, ptr %src, align 16
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <8 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bcst
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <8 x i16>, ptr %src, align 16
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <8 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %bcst
;
; SSE42-LABEL: 'broadcast_load_v8i16_v8i16'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <8 x i16>, ptr %src, align 16
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <8 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bcst
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <8 x i16>, ptr %src, align 16
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <8 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %bcst
;
; AVX1-LABEL: 'broadcast_load_v8i16_v8i16'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <8 x i16>, ptr %src, align 16
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <8 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bcst
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <8 x i16>, ptr %src, align 16
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %bcst
;
; AVX2-LABEL: 'broadcast_load_v8i16_v8i16'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <8 x i16>, ptr %src, align 16
-; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <8 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bcst
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <8 x i16>, ptr %src, align 16
+; AVX2-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %bcst
;
; AVX512-LABEL: 'broadcast_load_v8i16_v8i16'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <8 x i16>, ptr %src, align 16
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <8 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bcst
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <8 x i16>, ptr %src, align 16
+; AVX512-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %bcst
;
%load = load <8 x i16>, ptr %src
%bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <8 x i32> zeroinitializer
@@ -373,34 +373,34 @@ define <8 x i16> @broadcast_load_v8i16_v8i16(ptr %src) {
define <16 x i16> @broadcast_load_v8i16_v16i16(ptr %src) {
; SSE2-LABEL: 'broadcast_load_v8i16_v16i16'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <8 x i16>, ptr %src, align 16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <16 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bcst
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <8 x i16>, ptr %src, align 16
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %bcst
;
; SSSE3-LABEL: 'broadcast_load_v8i16_v16i16'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <8 x i16>, ptr %src, align 16
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <16 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bcst
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <8 x i16>, ptr %src, align 16
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <16 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %bcst
;
; SSE42-LABEL: 'broadcast_load_v8i16_v16i16'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <8 x i16>, ptr %src, align 16
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <16 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bcst
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <8 x i16>, ptr %src, align 16
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %bcst
;
; AVX1-LABEL: 'broadcast_load_v8i16_v16i16'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <8 x i16>, ptr %src, align 16
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <16 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bcst
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <8 x i16>, ptr %src, align 16
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %bcst
;
; AVX2-LABEL: 'broadcast_load_v8i16_v16i16'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <8 x i16>, ptr %src, align 16
-; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <16 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bcst
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <8 x i16>, ptr %src, align 16
+; AVX2-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %bcst
;
; AVX512-LABEL: 'broadcast_load_v8i16_v16i16'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <8 x i16>, ptr %src, align 16
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <16 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bcst
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <8 x i16>, ptr %src, align 16
+; AVX512-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %bcst
;
%load = load <8 x i16>, ptr %src
%bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <16 x i32> zeroinitializer
@@ -409,34 +409,34 @@ define <16 x i16> @broadcast_load_v8i16_v16i16(ptr %src) {
define <32 x i16> @broadcast_load_v8i16_v32i16(ptr %src) {
; SSE2-LABEL: 'broadcast_load_v8i16_v32i16'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <8 x i16>, ptr %src, align 16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <32 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bcst
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <8 x i16>, ptr %src, align 16
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %bcst
;
; SSSE3-LABEL: 'broadcast_load_v8i16_v32i16'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <8 x i16>, ptr %src, align 16
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <32 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bcst
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <8 x i16>, ptr %src, align 16
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <32 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %bcst
;
; SSE42-LABEL: 'broadcast_load_v8i16_v32i16'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <8 x i16>, ptr %src, align 16
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <32 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bcst
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <8 x i16>, ptr %src, align 16
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <32 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %bcst
;
; AVX1-LABEL: 'broadcast_load_v8i16_v32i16'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <8 x i16>, ptr %src, align 16
-; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <32 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bcst
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <8 x i16>, ptr %src, align 16
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %bcst
;
; AVX2-LABEL: 'broadcast_load_v8i16_v32i16'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <8 x i16>, ptr %src, align 16
-; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <32 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bcst
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <8 x i16>, ptr %src, align 16
+; AVX2-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %bcst
;
; AVX512-LABEL: 'broadcast_load_v8i16_v32i16'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <8 x i16>, ptr %src, align 16
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <32 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bcst
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <8 x i16>, ptr %src, align 16
+; AVX512-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <32 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %bcst
;
%load = load <8 x i16>, ptr %src
%bcst = shufflevector <8 x i16> %load, <8 x i16> poison, <32 x i32> zeroinitializer
@@ -449,34 +449,34 @@ define <32 x i16> @broadcast_load_v8i16_v32i16(ptr %src) {
define <16 x i8> @broadcast_load_v16i8_v16i8(ptr %src) {
; SSE2-LABEL: 'broadcast_load_v16i8_v16i8'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <16 x i8>, ptr %src, align 16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <16 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %bcst
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <16 x i8>, ptr %src, align 16
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %bcst
;
; SSSE3-LABEL: 'broadcast_load_v16i8_v16i8'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <16 x i8>, ptr %src, align 16
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <16 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %bcst
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <16 x i8>, ptr %src, align 16
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <16 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %bcst
;
; SSE42-LABEL: 'broadcast_load_v16i8_v16i8'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <16 x i8>, ptr %src, align 16
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <16 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %bcst
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <16 x i8>, ptr %src, align 16
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %bcst
;
; AVX1-LABEL: 'broadcast_load_v16i8_v16i8'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <16 x i8>, ptr %src, align 16
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <16 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %bcst
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <16 x i8>, ptr %src, align 16
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %bcst
;
; AVX2-LABEL: 'broadcast_load_v16i8_v16i8'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <16 x i8>, ptr %src, align 16
-; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <16 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %bcst
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <16 x i8>, ptr %src, align 16
+; AVX2-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %bcst
;
; AVX512-LABEL: 'broadcast_load_v16i8_v16i8'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <16 x i8>, ptr %src, align 16
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <16 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %bcst
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <16 x i8>, ptr %src, align 16
+; AVX512-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %bcst
;
%load = load <16 x i8>, ptr %src
%bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <16 x i32> zeroinitializer
@@ -485,34 +485,34 @@ define <16 x i8> @broadcast_load_v16i8_v16i8(ptr %src) {
define <32 x i8> @broadcast_load_v16i8_v32i8(ptr %src) {
; SSE2-LABEL: 'broadcast_load_v16i8_v32i8'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <16 x i8>, ptr %src, align 16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <32 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %bcst
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <16 x i8>, ptr %src, align 16
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %bcst
;
; SSSE3-LABEL: 'broadcast_load_v16i8_v32i8'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <16 x i8>, ptr %src, align 16
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <32 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %bcst
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <16 x i8>, ptr %src, align 16
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <32 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %bcst
;
; SSE42-LABEL: 'broadcast_load_v16i8_v32i8'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <16 x i8>, ptr %src, align 16
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <32 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %bcst
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <16 x i8>, ptr %src, align 16
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <32 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %bcst
;
; AVX1-LABEL: 'broadcast_load_v16i8_v32i8'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <16 x i8>, ptr %src, align 16
-; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <32 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %bcst
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <16 x i8>, ptr %src, align 16
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %bcst
;
; AVX2-LABEL: 'broadcast_load_v16i8_v32i8'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <16 x i8>, ptr %src, align 16
-; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <32 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %bcst
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <16 x i8>, ptr %src, align 16
+; AVX2-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %bcst
;
; AVX512-LABEL: 'broadcast_load_v16i8_v32i8'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <16 x i8>, ptr %src, align 16
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <32 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %bcst
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <16 x i8>, ptr %src, align 16
+; AVX512-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <32 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %bcst
;
%load = load <16 x i8>, ptr %src
%bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <32 x i32> zeroinitializer
@@ -521,34 +521,34 @@ define <32 x i8> @broadcast_load_v16i8_v32i8(ptr %src) {
define <64 x i8> @broadcast_load_v16i8_v64i8(ptr %src) {
; SSE2-LABEL: 'broadcast_load_v16i8_v64i8'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <16 x i8>, ptr %src, align 16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <64 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %bcst
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <16 x i8>, ptr %src, align 16
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <64 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %bcst
;
; SSSE3-LABEL: 'broadcast_load_v16i8_v64i8'
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <16 x i8>, ptr %src, align 16
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <64 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %bcst
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <16 x i8>, ptr %src, align 16
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <64 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %bcst
;
; SSE42-LABEL: 'broadcast_load_v16i8_v64i8'
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <16 x i8>, ptr %src, align 16
-; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <64 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %bcst
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <16 x i8>, ptr %src, align 16
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <64 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %bcst
;
; AVX1-LABEL: 'broadcast_load_v16i8_v64i8'
-; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <16 x i8>, ptr %src, align 16
-; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <64 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %bcst
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <16 x i8>, ptr %src, align 16
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <64 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %bcst
;
; AVX2-LABEL: 'broadcast_load_v16i8_v64i8'
-; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <16 x i8>, ptr %src, align 16
-; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <64 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %bcst
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <16 x i8>, ptr %src, align 16
+; AVX2-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <64 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %bcst
;
; AVX512-LABEL: 'broadcast_load_v16i8_v64i8'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <16 x i8>, ptr %src, align 16
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <64 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %bcst
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load = load <16 x i8>, ptr %src, align 16
+; AVX512-NEXT: Cost Model: Found costs of 0 for: %bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <64 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %bcst
;
%load = load <16 x i8>, ptr %src
%bcst = shufflevector <16 x i8> %load, <16 x i8> poison, <64 x i32> zeroinitializer
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
index 3031629..aea7cc8 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
@@ -2032,7 +2032,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
;
; AVX1-LABEL: 'test_gather_16f32_const_mask2'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64>
; AVX1-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
; AVX1-NEXT: Cost Model: Found costs of 50 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> splat (i1 true), <16 x float> undef)
@@ -2040,7 +2040,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
;
; AVX2-LABEL: 'test_gather_16f32_const_mask2'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64>
; AVX2-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
; AVX2-NEXT: Cost Model: Found costs of 50 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> splat (i1 true), <16 x float> undef)
@@ -2048,7 +2048,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
;
; SKL-LABEL: 'test_gather_16f32_const_mask2'
; SKL-NEXT: Cost Model: Found costs of 1 for: %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0
-; SKL-NEXT: Cost Model: Found costs of 1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
+; SKL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
; SKL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64>
; SKL-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> splat (i1 true), <16 x float> undef)
@@ -2056,7 +2056,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
;
; AVX512-LABEL: 'test_gather_16f32_const_mask2'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64>
; AVX512-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
; AVX512-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:18 SizeLat:18 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> splat (i1 true), <16 x float> undef)
@@ -2091,7 +2091,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32
;
; AVX1-LABEL: 'test_scatter_16i32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
; AVX1-NEXT: Cost Model: Found costs of 1 for: %imask = bitcast i16 %mask to <16 x i1>
; AVX1-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
@@ -2099,7 +2099,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32
;
; AVX2-LABEL: 'test_scatter_16i32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
; AVX2-NEXT: Cost Model: Found costs of 1 for: %imask = bitcast i16 %mask to <16 x i1>
; AVX2-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
@@ -2107,7 +2107,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32
;
; SKL-LABEL: 'test_scatter_16i32'
; SKL-NEXT: Cost Model: Found costs of 1 for: %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0
-; SKL-NEXT: Cost Model: Found costs of 1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
+; SKL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
; SKL-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
; SKL-NEXT: Cost Model: Found costs of 1 for: %imask = bitcast i16 %mask to <16 x i1>
; SKL-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
@@ -2115,7 +2115,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32
;
; AVX512-LABEL: 'test_scatter_16i32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
; AVX512-NEXT: Cost Model: Found costs of 1 for: %imask = bitcast i16 %mask to <16 x i1>
; AVX512-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
index 8fae37b3..742b5b2 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
@@ -2032,7 +2032,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
;
; AVX1-LABEL: 'test_gather_16f32_const_mask2'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64>
; AVX1-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
; AVX1-NEXT: Cost Model: Found costs of 50 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> splat (i1 true), <16 x float> undef)
@@ -2040,7 +2040,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
;
; AVX2-LABEL: 'test_gather_16f32_const_mask2'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64>
; AVX2-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
; AVX2-NEXT: Cost Model: Found costs of 50 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> splat (i1 true), <16 x float> undef)
@@ -2048,7 +2048,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
;
; SKL-LABEL: 'test_gather_16f32_const_mask2'
; SKL-NEXT: Cost Model: Found costs of 1 for: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
-; SKL-NEXT: Cost Model: Found costs of 1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SKL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
; SKL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64>
; SKL-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> splat (i1 true), <16 x float> undef)
@@ -2056,7 +2056,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
;
; AVX512-LABEL: 'test_gather_16f32_const_mask2'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64>
; AVX512-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
; AVX512-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:18 SizeLat:18 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> splat (i1 true), <16 x float> undef)
@@ -2091,7 +2091,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32
;
; AVX1-LABEL: 'test_scatter_16i32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
; AVX1-NEXT: Cost Model: Found costs of 1 for: %imask = bitcast i16 %mask to <16 x i1>
; AVX1-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
@@ -2099,7 +2099,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32
;
; AVX2-LABEL: 'test_scatter_16i32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
; AVX2-NEXT: Cost Model: Found costs of 1 for: %imask = bitcast i16 %mask to <16 x i1>
; AVX2-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
@@ -2107,7 +2107,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32
;
; SKL-LABEL: 'test_scatter_16i32'
; SKL-NEXT: Cost Model: Found costs of 1 for: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
-; SKL-NEXT: Cost Model: Found costs of 1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SKL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
; SKL-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
; SKL-NEXT: Cost Model: Found costs of 1 for: %imask = bitcast i16 %mask to <16 x i1>
; SKL-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
@@ -2115,7 +2115,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32
;
; AVX512-LABEL: 'test_scatter_16i32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
; AVX512-NEXT: Cost Model: Found costs of 1 for: %imask = bitcast i16 %mask to <16 x i1>
; AVX512-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-broadcast-fp16.ll b/llvm/test/Analysis/CostModel/X86/shuffle-broadcast-fp16.ll
index a8ceebf..c4cf8b7 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-broadcast-fp16.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-broadcast-fp16.ll
@@ -3,11 +3,11 @@
define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128, <16 x half> %src256, <32 x half> %src512) {
; CHECK-LABEL: 'test_vXf16'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-broadcast.ll b/llvm/test/Analysis/CostModel/X86/shuffle-broadcast.ll
index 1a2d098..79e1622 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-broadcast.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-broadcast.ll
@@ -25,20 +25,20 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
;
; AVX1-LABEL: 'test_vXf64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXf64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXf64'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> zeroinitializer
@@ -56,20 +56,20 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
;
; AVX1-LABEL: 'test_vXi64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXi64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXi64'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> zeroinitializer
@@ -89,22 +89,22 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
; AVX1-LABEL: 'test_vXf32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXf32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXf32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> zeroinitializer
@@ -125,22 +125,22 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
; AVX1-LABEL: 'test_vXi32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXi32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXi32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -154,41 +154,41 @@ define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128
; SSE2-LABEL: 'test_vXf16'
; SSE2-NEXT: Cost Model: Found costs of 0 for: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of 0 for: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSSE3-LABEL: 'test_vXf16'
; SSSE3-NEXT: Cost Model: Found costs of 0 for: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
; SSSE3-NEXT: Cost Model: Found costs of 0 for: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'test_vXf16'
; SSE42-NEXT: Cost Model: Found costs of 0 for: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of 0 for: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXf16'
; AVX2-NEXT: Cost Model: Found costs of 0 for: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of 0 for: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXf16'
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
@@ -244,49 +244,49 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
; SSE2-LABEL: 'test_vXi16'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSSE3-LABEL: 'test_vXi16'
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> zeroinitializer
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'test_vXi16'
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'test_vXi16'
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 3 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 3 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXi16'
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXi16'
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> zeroinitializer
@@ -302,54 +302,54 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
; SSE2-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of 2 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of 2 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSSE3-LABEL: 'test_vXi8'
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> zeroinitializer
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'test_vXi8'
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'test_vXi8'
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXi8'
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXi8'
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> zeroinitializer
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> zeroinitializer
@@ -365,46 +365,46 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
; SSE2-LABEL: 'test_vXi1'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of 1 for: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSSE3-LABEL: 'test_vXi1'
; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> zeroinitializer
; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> zeroinitializer
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> zeroinitializer
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> zeroinitializer
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'test_vXi1'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of 1 for: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> zeroinitializer
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'test_vXi1'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> zeroinitializer
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXi1'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> zeroinitializer
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXi1'
@@ -440,22 +440,22 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
; AVX1-LABEL: 'test_upper_vXf32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 2, i32 2>
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_upper_vXf32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 2, i32 2>
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_upper_vXf32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 2, i32 2>
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 2, i32 2>
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-load.ll b/llvm/test/Analysis/CostModel/X86/shuffle-load.ll
index ccfb543..8bb9cbc 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-load.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-load.ll
@@ -32,21 +32,21 @@ define void @shuffle_load() {
; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_8xi8 = load <8 x i8>, ptr undef, align 8
; SSE-NEXT: Cost Model: Found costs of 2 for: %sf_8xi8 = shufflevector <8 x i8> %ld_8xi8, <8 x i8> undef, <8 x i32> zeroinitializer
; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_16xi8 = load <16 x i8>, ptr undef, align 16
-; SSE-NEXT: Cost Model: Found costs of 3 for: %sf_16xi8 = shufflevector <16 x i8> %ld_16xi8, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %sf_16xi8 = shufflevector <16 x i8> %ld_16xi8, <16 x i8> undef, <16 x i32> zeroinitializer
; SSE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %ld_32xi8 = load <32 x i8>, ptr undef, align 32
-; SSE-NEXT: Cost Model: Found costs of 3 for: %sf_32xi8 = shufflevector <32 x i8> %ld_32xi8, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %sf_32xi8 = shufflevector <32 x i8> %ld_32xi8, <32 x i8> undef, <32 x i32> zeroinitializer
; SSE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %ld_64xi8 = load <64 x i8>, ptr undef, align 64
-; SSE-NEXT: Cost Model: Found costs of 3 for: %sf_64xi8 = shufflevector <64 x i8> %ld_64xi8, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %sf_64xi8 = shufflevector <64 x i8> %ld_64xi8, <64 x i8> undef, <64 x i32> zeroinitializer
; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_2xi16 = load <2 x i16>, ptr undef, align 4
; SSE-NEXT: Cost Model: Found costs of 1 for: %sf_2xi16 = shufflevector <2 x i16> %ld_2xi16, <2 x i16> undef, <2 x i32> zeroinitializer
; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_4xi16 = load <4 x i16>, ptr undef, align 8
; SSE-NEXT: Cost Model: Found costs of 1 for: %sf_4xi16 = shufflevector <4 x i16> %ld_4xi16, <4 x i16> undef, <4 x i32> zeroinitializer
; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_8xi16 = load <8 x i16>, ptr undef, align 16
-; SSE-NEXT: Cost Model: Found costs of 2 for: %sf_8xi16 = shufflevector <8 x i16> %ld_8xi16, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_8xi16 = shufflevector <8 x i16> %ld_8xi16, <8 x i16> undef, <8 x i32> zeroinitializer
; SSE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %ld_16xi16 = load <16 x i16>, ptr undef, align 32
-; SSE-NEXT: Cost Model: Found costs of 2 for: %sf_16xi16 = shufflevector <16 x i16> %ld_16xi16, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_16xi16 = shufflevector <16 x i16> %ld_16xi16, <16 x i16> undef, <16 x i32> zeroinitializer
; SSE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %ld_32xi16 = load <32 x i16>, ptr undef, align 64
-; SSE-NEXT: Cost Model: Found costs of 2 for: %sf_32xi16 = shufflevector <32 x i16> %ld_32xi16, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_32xi16 = shufflevector <32 x i16> %ld_32xi16, <32 x i16> undef, <32 x i32> zeroinitializer
; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_2xi32 = load <2 x i32>, ptr undef, align 8
; SSE-NEXT: Cost Model: Found costs of 1 for: %sf_2xi32 = shufflevector <2 x i32> %ld_2xi32, <2 x i32> undef, <2 x i32> zeroinitializer
; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_4xi32 = load <4 x i32>, ptr undef, align 16
@@ -66,11 +66,11 @@ define void @shuffle_load() {
; SSE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %ld_4xf16 = load <4 x half>, ptr undef, align 8
; SSE-NEXT: Cost Model: Found costs of 0 for: %sf_4xf16 = shufflevector <4 x half> %ld_4xf16, <4 x half> undef, <4 x i32> zeroinitializer
; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_8xf16 = load <8 x half>, ptr undef, align 16
-; SSE-NEXT: Cost Model: Found costs of 2 for: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
; SSE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %ld_16xf16 = load <16 x half>, ptr undef, align 32
-; SSE-NEXT: Cost Model: Found costs of 2 for: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
; SSE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %ld_32xf16 = load <32 x half>, ptr undef, align 64
-; SSE-NEXT: Cost Model: Found costs of 2 for: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_2xf32 = load <2 x float>, ptr undef, align 8
; SSE-NEXT: Cost Model: Found costs of 1 for: %sf_2xf32 = shufflevector <2 x float> %ld_2xf32, <2 x float> undef, <2 x i32> zeroinitializer
; SSE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_4xf32 = load <4 x float>, ptr undef, align 16
@@ -95,21 +95,21 @@ define void @shuffle_load() {
; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_8xi8 = load <8 x i8>, ptr undef, align 8
; SSE2-NEXT: Cost Model: Found costs of 2 for: %sf_8xi8 = shufflevector <8 x i8> %ld_8xi8, <8 x i8> undef, <8 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_16xi8 = load <16 x i8>, ptr undef, align 16
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %sf_16xi8 = shufflevector <16 x i8> %ld_16xi8, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %sf_16xi8 = shufflevector <16 x i8> %ld_16xi8, <16 x i8> undef, <16 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %ld_32xi8 = load <32 x i8>, ptr undef, align 32
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %sf_32xi8 = shufflevector <32 x i8> %ld_32xi8, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %sf_32xi8 = shufflevector <32 x i8> %ld_32xi8, <32 x i8> undef, <32 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %ld_64xi8 = load <64 x i8>, ptr undef, align 64
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %sf_64xi8 = shufflevector <64 x i8> %ld_64xi8, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %sf_64xi8 = shufflevector <64 x i8> %ld_64xi8, <64 x i8> undef, <64 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_2xi16 = load <2 x i16>, ptr undef, align 4
; SSE2-NEXT: Cost Model: Found costs of 1 for: %sf_2xi16 = shufflevector <2 x i16> %ld_2xi16, <2 x i16> undef, <2 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_4xi16 = load <4 x i16>, ptr undef, align 8
; SSE2-NEXT: Cost Model: Found costs of 1 for: %sf_4xi16 = shufflevector <4 x i16> %ld_4xi16, <4 x i16> undef, <4 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_8xi16 = load <8 x i16>, ptr undef, align 16
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %sf_8xi16 = shufflevector <8 x i16> %ld_8xi16, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_8xi16 = shufflevector <8 x i16> %ld_8xi16, <8 x i16> undef, <8 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %ld_16xi16 = load <16 x i16>, ptr undef, align 32
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %sf_16xi16 = shufflevector <16 x i16> %ld_16xi16, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_16xi16 = shufflevector <16 x i16> %ld_16xi16, <16 x i16> undef, <16 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %ld_32xi16 = load <32 x i16>, ptr undef, align 64
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %sf_32xi16 = shufflevector <32 x i16> %ld_32xi16, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_32xi16 = shufflevector <32 x i16> %ld_32xi16, <32 x i16> undef, <32 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_2xi32 = load <2 x i32>, ptr undef, align 8
; SSE2-NEXT: Cost Model: Found costs of 1 for: %sf_2xi32 = shufflevector <2 x i32> %ld_2xi32, <2 x i32> undef, <2 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_4xi32 = load <4 x i32>, ptr undef, align 16
@@ -129,11 +129,11 @@ define void @shuffle_load() {
; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %ld_4xf16 = load <4 x half>, ptr undef, align 8
; SSE2-NEXT: Cost Model: Found costs of 0 for: %sf_4xf16 = shufflevector <4 x half> %ld_4xf16, <4 x half> undef, <4 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_8xf16 = load <8 x half>, ptr undef, align 16
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %ld_16xf16 = load <16 x half>, ptr undef, align 32
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %ld_32xf16 = load <32 x half>, ptr undef, align 64
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_2xf32 = load <2 x float>, ptr undef, align 8
; SSE2-NEXT: Cost Model: Found costs of 1 for: %sf_2xf32 = shufflevector <2 x float> %ld_2xf32, <2 x float> undef, <2 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_4xf32 = load <4 x float>, ptr undef, align 16
@@ -158,21 +158,21 @@ define void @shuffle_load() {
; SSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_8xi8 = load <8 x i8>, ptr undef, align 8
; SSE3-NEXT: Cost Model: Found costs of 2 for: %sf_8xi8 = shufflevector <8 x i8> %ld_8xi8, <8 x i8> undef, <8 x i32> zeroinitializer
; SSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_16xi8 = load <16 x i8>, ptr undef, align 16
-; SSE3-NEXT: Cost Model: Found costs of 3 for: %sf_16xi8 = shufflevector <16 x i8> %ld_16xi8, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %sf_16xi8 = shufflevector <16 x i8> %ld_16xi8, <16 x i8> undef, <16 x i32> zeroinitializer
; SSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %ld_32xi8 = load <32 x i8>, ptr undef, align 32
-; SSE3-NEXT: Cost Model: Found costs of 3 for: %sf_32xi8 = shufflevector <32 x i8> %ld_32xi8, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %sf_32xi8 = shufflevector <32 x i8> %ld_32xi8, <32 x i8> undef, <32 x i32> zeroinitializer
; SSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %ld_64xi8 = load <64 x i8>, ptr undef, align 64
-; SSE3-NEXT: Cost Model: Found costs of 3 for: %sf_64xi8 = shufflevector <64 x i8> %ld_64xi8, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %sf_64xi8 = shufflevector <64 x i8> %ld_64xi8, <64 x i8> undef, <64 x i32> zeroinitializer
; SSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_2xi16 = load <2 x i16>, ptr undef, align 4
; SSE3-NEXT: Cost Model: Found costs of 1 for: %sf_2xi16 = shufflevector <2 x i16> %ld_2xi16, <2 x i16> undef, <2 x i32> zeroinitializer
; SSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_4xi16 = load <4 x i16>, ptr undef, align 8
; SSE3-NEXT: Cost Model: Found costs of 1 for: %sf_4xi16 = shufflevector <4 x i16> %ld_4xi16, <4 x i16> undef, <4 x i32> zeroinitializer
; SSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_8xi16 = load <8 x i16>, ptr undef, align 16
-; SSE3-NEXT: Cost Model: Found costs of 2 for: %sf_8xi16 = shufflevector <8 x i16> %ld_8xi16, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_8xi16 = shufflevector <8 x i16> %ld_8xi16, <8 x i16> undef, <8 x i32> zeroinitializer
; SSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %ld_16xi16 = load <16 x i16>, ptr undef, align 32
-; SSE3-NEXT: Cost Model: Found costs of 2 for: %sf_16xi16 = shufflevector <16 x i16> %ld_16xi16, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_16xi16 = shufflevector <16 x i16> %ld_16xi16, <16 x i16> undef, <16 x i32> zeroinitializer
; SSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %ld_32xi16 = load <32 x i16>, ptr undef, align 64
-; SSE3-NEXT: Cost Model: Found costs of 2 for: %sf_32xi16 = shufflevector <32 x i16> %ld_32xi16, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_32xi16 = shufflevector <32 x i16> %ld_32xi16, <32 x i16> undef, <32 x i32> zeroinitializer
; SSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_2xi32 = load <2 x i32>, ptr undef, align 8
; SSE3-NEXT: Cost Model: Found costs of 1 for: %sf_2xi32 = shufflevector <2 x i32> %ld_2xi32, <2 x i32> undef, <2 x i32> zeroinitializer
; SSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_4xi32 = load <4 x i32>, ptr undef, align 16
@@ -192,11 +192,11 @@ define void @shuffle_load() {
; SSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %ld_4xf16 = load <4 x half>, ptr undef, align 8
; SSE3-NEXT: Cost Model: Found costs of 0 for: %sf_4xf16 = shufflevector <4 x half> %ld_4xf16, <4 x half> undef, <4 x i32> zeroinitializer
; SSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_8xf16 = load <8 x half>, ptr undef, align 16
-; SSE3-NEXT: Cost Model: Found costs of 2 for: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
; SSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %ld_16xf16 = load <16 x half>, ptr undef, align 32
-; SSE3-NEXT: Cost Model: Found costs of 2 for: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
; SSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %ld_32xf16 = load <32 x half>, ptr undef, align 64
-; SSE3-NEXT: Cost Model: Found costs of 2 for: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
; SSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_2xf32 = load <2 x float>, ptr undef, align 8
; SSE3-NEXT: Cost Model: Found costs of 1 for: %sf_2xf32 = shufflevector <2 x float> %ld_2xf32, <2 x float> undef, <2 x i32> zeroinitializer
; SSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_4xf32 = load <4 x float>, ptr undef, align 16
@@ -221,21 +221,21 @@ define void @shuffle_load() {
; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_8xi8 = load <8 x i8>, ptr undef, align 8
; AVX-NEXT: Cost Model: Found costs of 2 for: %sf_8xi8 = shufflevector <8 x i8> %ld_8xi8, <8 x i8> undef, <8 x i32> zeroinitializer
; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_16xi8 = load <16 x i8>, ptr undef, align 16
-; AVX-NEXT: Cost Model: Found costs of 3 for: %sf_16xi8 = shufflevector <16 x i8> %ld_16xi8, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %sf_16xi8 = shufflevector <16 x i8> %ld_16xi8, <16 x i8> undef, <16 x i32> zeroinitializer
; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %ld_32xi8 = load <32 x i8>, ptr undef, align 32
-; AVX-NEXT: Cost Model: Found costs of 3 for: %sf_32xi8 = shufflevector <32 x i8> %ld_32xi8, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %sf_32xi8 = shufflevector <32 x i8> %ld_32xi8, <32 x i8> undef, <32 x i32> zeroinitializer
; AVX-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %ld_64xi8 = load <64 x i8>, ptr undef, align 64
-; AVX-NEXT: Cost Model: Found costs of 3 for: %sf_64xi8 = shufflevector <64 x i8> %ld_64xi8, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %sf_64xi8 = shufflevector <64 x i8> %ld_64xi8, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_2xi16 = load <2 x i16>, ptr undef, align 4
; AVX-NEXT: Cost Model: Found costs of 1 for: %sf_2xi16 = shufflevector <2 x i16> %ld_2xi16, <2 x i16> undef, <2 x i32> zeroinitializer
; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_4xi16 = load <4 x i16>, ptr undef, align 8
; AVX-NEXT: Cost Model: Found costs of 1 for: %sf_4xi16 = shufflevector <4 x i16> %ld_4xi16, <4 x i16> undef, <4 x i32> zeroinitializer
; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_8xi16 = load <8 x i16>, ptr undef, align 16
-; AVX-NEXT: Cost Model: Found costs of 2 for: %sf_8xi16 = shufflevector <8 x i16> %ld_8xi16, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_8xi16 = shufflevector <8 x i16> %ld_8xi16, <8 x i16> undef, <8 x i32> zeroinitializer
; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %ld_16xi16 = load <16 x i16>, ptr undef, align 32
-; AVX-NEXT: Cost Model: Found costs of 2 for: %sf_16xi16 = shufflevector <16 x i16> %ld_16xi16, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_16xi16 = shufflevector <16 x i16> %ld_16xi16, <16 x i16> undef, <16 x i32> zeroinitializer
; AVX-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %ld_32xi16 = load <32 x i16>, ptr undef, align 64
-; AVX-NEXT: Cost Model: Found costs of 2 for: %sf_32xi16 = shufflevector <32 x i16> %ld_32xi16, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_32xi16 = shufflevector <32 x i16> %ld_32xi16, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_2xi32 = load <2 x i32>, ptr undef, align 8
; AVX-NEXT: Cost Model: Found costs of 1 for: %sf_2xi32 = shufflevector <2 x i32> %ld_2xi32, <2 x i32> undef, <2 x i32> zeroinitializer
; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_4xi32 = load <4 x i32>, ptr undef, align 16
@@ -255,11 +255,11 @@ define void @shuffle_load() {
; AVX-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %ld_4xf16 = load <4 x half>, ptr undef, align 8
; AVX-NEXT: Cost Model: Found costs of 0 for: %sf_4xf16 = shufflevector <4 x half> %ld_4xf16, <4 x half> undef, <4 x i32> zeroinitializer
; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_8xf16 = load <8 x half>, ptr undef, align 16
-; AVX-NEXT: Cost Model: Found costs of 2 for: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %ld_16xf16 = load <16 x half>, ptr undef, align 32
-; AVX-NEXT: Cost Model: Found costs of 2 for: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
; AVX-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %ld_32xf16 = load <32 x half>, ptr undef, align 64
-; AVX-NEXT: Cost Model: Found costs of 2 for: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_2xf32 = load <2 x float>, ptr undef, align 8
; AVX-NEXT: Cost Model: Found costs of 1 for: %sf_2xf32 = shufflevector <2 x float> %ld_2xf32, <2 x float> undef, <2 x i32> zeroinitializer
; AVX-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_4xf32 = load <4 x float>, ptr undef, align 16
@@ -284,21 +284,21 @@ define void @shuffle_load() {
; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_8xi8 = load <8 x i8>, ptr undef, align 8
; AVX2-NEXT: Cost Model: Found costs of 2 for: %sf_8xi8 = shufflevector <8 x i8> %ld_8xi8, <8 x i8> undef, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_16xi8 = load <16 x i8>, ptr undef, align 16
-; AVX2-NEXT: Cost Model: Found costs of 3 for: %sf_16xi8 = shufflevector <16 x i8> %ld_16xi8, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %sf_16xi8 = shufflevector <16 x i8> %ld_16xi8, <16 x i8> undef, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %ld_32xi8 = load <32 x i8>, ptr undef, align 32
-; AVX2-NEXT: Cost Model: Found costs of 3 for: %sf_32xi8 = shufflevector <32 x i8> %ld_32xi8, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %sf_32xi8 = shufflevector <32 x i8> %ld_32xi8, <32 x i8> undef, <32 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %ld_64xi8 = load <64 x i8>, ptr undef, align 64
-; AVX2-NEXT: Cost Model: Found costs of 3 for: %sf_64xi8 = shufflevector <64 x i8> %ld_64xi8, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %sf_64xi8 = shufflevector <64 x i8> %ld_64xi8, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_2xi16 = load <2 x i16>, ptr undef, align 4
; AVX2-NEXT: Cost Model: Found costs of 1 for: %sf_2xi16 = shufflevector <2 x i16> %ld_2xi16, <2 x i16> undef, <2 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_4xi16 = load <4 x i16>, ptr undef, align 8
; AVX2-NEXT: Cost Model: Found costs of 1 for: %sf_4xi16 = shufflevector <4 x i16> %ld_4xi16, <4 x i16> undef, <4 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_8xi16 = load <8 x i16>, ptr undef, align 16
-; AVX2-NEXT: Cost Model: Found costs of 2 for: %sf_8xi16 = shufflevector <8 x i16> %ld_8xi16, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_8xi16 = shufflevector <8 x i16> %ld_8xi16, <8 x i16> undef, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %ld_16xi16 = load <16 x i16>, ptr undef, align 32
-; AVX2-NEXT: Cost Model: Found costs of 2 for: %sf_16xi16 = shufflevector <16 x i16> %ld_16xi16, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_16xi16 = shufflevector <16 x i16> %ld_16xi16, <16 x i16> undef, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %ld_32xi16 = load <32 x i16>, ptr undef, align 64
-; AVX2-NEXT: Cost Model: Found costs of 2 for: %sf_32xi16 = shufflevector <32 x i16> %ld_32xi16, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_32xi16 = shufflevector <32 x i16> %ld_32xi16, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_2xi32 = load <2 x i32>, ptr undef, align 8
; AVX2-NEXT: Cost Model: Found costs of 1 for: %sf_2xi32 = shufflevector <2 x i32> %ld_2xi32, <2 x i32> undef, <2 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_4xi32 = load <4 x i32>, ptr undef, align 16
@@ -318,11 +318,11 @@ define void @shuffle_load() {
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %ld_4xf16 = load <4 x half>, ptr undef, align 8
; AVX2-NEXT: Cost Model: Found costs of 0 for: %sf_4xf16 = shufflevector <4 x half> %ld_4xf16, <4 x half> undef, <4 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_8xf16 = load <8 x half>, ptr undef, align 16
-; AVX2-NEXT: Cost Model: Found costs of 2 for: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %ld_16xf16 = load <16 x half>, ptr undef, align 32
-; AVX2-NEXT: Cost Model: Found costs of 2 for: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %ld_32xf16 = load <32 x half>, ptr undef, align 64
-; AVX2-NEXT: Cost Model: Found costs of 2 for: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_2xf32 = load <2 x float>, ptr undef, align 8
; AVX2-NEXT: Cost Model: Found costs of 1 for: %sf_2xf32 = shufflevector <2 x float> %ld_2xf32, <2 x float> undef, <2 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_4xf32 = load <4 x float>, ptr undef, align 16
@@ -347,21 +347,21 @@ define void @shuffle_load() {
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_8xi8 = load <8 x i8>, ptr undef, align 8
; AVX512-NEXT: Cost Model: Found costs of 2 for: %sf_8xi8 = shufflevector <8 x i8> %ld_8xi8, <8 x i8> undef, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_16xi8 = load <16 x i8>, ptr undef, align 16
-; AVX512-NEXT: Cost Model: Found costs of 3 for: %sf_16xi8 = shufflevector <16 x i8> %ld_16xi8, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %sf_16xi8 = shufflevector <16 x i8> %ld_16xi8, <16 x i8> undef, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %ld_32xi8 = load <32 x i8>, ptr undef, align 32
-; AVX512-NEXT: Cost Model: Found costs of 3 for: %sf_32xi8 = shufflevector <32 x i8> %ld_32xi8, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %sf_32xi8 = shufflevector <32 x i8> %ld_32xi8, <32 x i8> undef, <32 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %ld_64xi8 = load <64 x i8>, ptr undef, align 64
-; AVX512-NEXT: Cost Model: Found costs of 3 for: %sf_64xi8 = shufflevector <64 x i8> %ld_64xi8, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %sf_64xi8 = shufflevector <64 x i8> %ld_64xi8, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_2xi16 = load <2 x i16>, ptr undef, align 4
; AVX512-NEXT: Cost Model: Found costs of 1 for: %sf_2xi16 = shufflevector <2 x i16> %ld_2xi16, <2 x i16> undef, <2 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_4xi16 = load <4 x i16>, ptr undef, align 8
; AVX512-NEXT: Cost Model: Found costs of 1 for: %sf_4xi16 = shufflevector <4 x i16> %ld_4xi16, <4 x i16> undef, <4 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_8xi16 = load <8 x i16>, ptr undef, align 16
-; AVX512-NEXT: Cost Model: Found costs of 2 for: %sf_8xi16 = shufflevector <8 x i16> %ld_8xi16, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_8xi16 = shufflevector <8 x i16> %ld_8xi16, <8 x i16> undef, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %ld_16xi16 = load <16 x i16>, ptr undef, align 32
-; AVX512-NEXT: Cost Model: Found costs of 2 for: %sf_16xi16 = shufflevector <16 x i16> %ld_16xi16, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_16xi16 = shufflevector <16 x i16> %ld_16xi16, <16 x i16> undef, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %ld_32xi16 = load <32 x i16>, ptr undef, align 64
-; AVX512-NEXT: Cost Model: Found costs of 2 for: %sf_32xi16 = shufflevector <32 x i16> %ld_32xi16, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_32xi16 = shufflevector <32 x i16> %ld_32xi16, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_2xi32 = load <2 x i32>, ptr undef, align 8
; AVX512-NEXT: Cost Model: Found costs of 1 for: %sf_2xi32 = shufflevector <2 x i32> %ld_2xi32, <2 x i32> undef, <2 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_4xi32 = load <4 x i32>, ptr undef, align 16
@@ -381,11 +381,11 @@ define void @shuffle_load() {
; AVX512-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %ld_4xf16 = load <4 x half>, ptr undef, align 8
; AVX512-NEXT: Cost Model: Found costs of 0 for: %sf_4xf16 = shufflevector <4 x half> %ld_4xf16, <4 x half> undef, <4 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_8xf16 = load <8 x half>, ptr undef, align 16
-; AVX512-NEXT: Cost Model: Found costs of 2 for: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %ld_16xf16 = load <16 x half>, ptr undef, align 32
-; AVX512-NEXT: Cost Model: Found costs of 2 for: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %ld_32xf16 = load <32 x half>, ptr undef, align 64
-; AVX512-NEXT: Cost Model: Found costs of 2 for: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_2xf32 = load <2 x float>, ptr undef, align 8
; AVX512-NEXT: Cost Model: Found costs of 1 for: %sf_2xf32 = shufflevector <2 x float> %ld_2xf32, <2 x float> undef, <2 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %ld_4xf32 = load <4 x float>, ptr undef, align 16
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-reverse-fp16.ll b/llvm/test/Analysis/CostModel/X86/shuffle-reverse-fp16.ll
index 4affc7e..f47e1a3 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-reverse-fp16.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-reverse-fp16.ll
@@ -3,11 +3,11 @@
define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128, <16 x half> %src256, <32 x half> %src512) {
; CHECK-LABEL: 'test_vXf16'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:4 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 0>
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-reverse.ll b/llvm/test/Analysis/CostModel/X86/shuffle-reverse.ll
index 1bcf2d5..aca0ba3 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-reverse.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-reverse.ll
@@ -25,20 +25,20 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
;
; AVX1-LABEL: 'test_vXf64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:12 SizeLat:4 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXf64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:6 SizeLat:2 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:12 SizeLat:4 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXf64'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:6 SizeLat:2 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:5 SizeLat:3 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 0>
@@ -56,20 +56,20 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
;
; AVX1-LABEL: 'test_vXi64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:12 SizeLat:4 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXi64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:6 SizeLat:2 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:12 SizeLat:4 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXi64'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:6 SizeLat:2 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:5 SizeLat:3 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
@@ -89,22 +89,22 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
; AVX1-LABEL: 'test_vXf32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 0>
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:7 SizeLat:4 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:14 SizeLat:8 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXf32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 0>
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:7 SizeLat:4 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:14 SizeLat:8 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXf32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 0>
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:7 SizeLat:4 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:3 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 0>
@@ -125,22 +125,22 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
; AVX1-LABEL: 'test_vXi32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:7 SizeLat:4 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:14 SizeLat:8 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXi32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:7 SizeLat:4 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:14 SizeLat:8 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXi32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:7 SizeLat:4 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:3 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
@@ -154,65 +154,65 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
; SSE2-LABEL: 'test_vXi16'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
; SSE2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE2-NEXT: Cost Model: Found costs of 6 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE2-NEXT: Cost Model: Found costs of 12 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:6 SizeLat:6 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:12 SizeLat:12 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSSE3-LABEL: 'test_vXi16'
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:4 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:8 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'test_vXi16'
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:4 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:8 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'test_vXi16'
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 8 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:5 Lat:9 SizeLat:5 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:10 Lat:18 SizeLat:10 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXi16'
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:9 SizeLat:4 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:18 SizeLat:8 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'test_vXi16'
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512F-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512F-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:9 SizeLat:4 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512F-NEXT: Cost Model: Found costs of 7 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'test_vXi16'
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512BW-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512BW-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:4 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI-LABEL: 'test_vXi16'
-; AVX512VBMI-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; AVX512VBMI-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX512VBMI-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512VBMI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; AVX512VBMI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX512VBMI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512VBMI-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512VBMI-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512VBMI-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:4 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512VBMI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
@@ -227,71 +227,71 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
; SSE2-LABEL: 'test_vXi8'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
; SSE2-NEXT: Cost Model: Found costs of 3 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; SSE2-NEXT: Cost Model: Found costs of 9 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE2-NEXT: Cost Model: Found costs of 9 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE2-NEXT: Cost Model: Found costs of 18 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE2-NEXT: Cost Model: Found costs of 36 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:11 Lat:6 SizeLat:11 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:11 Lat:6 SizeLat:11 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:22 Lat:12 SizeLat:22 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:44 Lat:24 SizeLat:44 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSSE3-LABEL: 'test_vXi8'
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:4 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:8 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'test_vXi8'
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:4 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:8 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'test_vXi8'
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 8 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:5 Lat:9 SizeLat:5 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:10 Lat:18 SizeLat:10 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXi8'
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:9 SizeLat:4 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:18 SizeLat:8 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'test_vXi8'
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512F-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512F-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:9 SizeLat:4 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512F-NEXT: Cost Model: Found costs of 7 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'test_vXi8'
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512BW-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512BW-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:9 SizeLat:4 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:9 SizeLat:3 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI-LABEL: 'test_vXi8'
-; AVX512VBMI-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; AVX512VBMI-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX512VBMI-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512VBMI-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512VBMI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
+; AVX512VBMI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX512VBMI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512VBMI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512VBMI-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512VBMI-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512VBMI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
@@ -320,22 +320,22 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
; AVX1-LABEL: 'test_upper_vXf32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 2>
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
-; AVX1-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16>
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:7 SizeLat:4 for: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:14 SizeLat:8 for: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_upper_vXf32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 2>
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
-; AVX2-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16>
+; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:7 SizeLat:4 for: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:14 SizeLat:8 for: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_upper_vXf32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 2>
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
-; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16>
+; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:7 SizeLat:4 for: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:3 for: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 2>
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-single-src-fp16.ll b/llvm/test/Analysis/CostModel/X86/shuffle-single-src-fp16.ll
index 007aa59bc..21d5766 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-single-src-fp16.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-single-src-fp16.ll
@@ -3,15 +3,15 @@
define void @test_vXf16(<8 x half> %src128, <16 x half> %src256, <32 x half> %src512, <64 x half> %src1024) {
; CHECK-LABEL: 'test_vXf16'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 31, i32 30, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V1024 = shufflevector <64 x half> %src1024, <64 x half> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 0, i32 1, i32 0>
+; CHECK-NEXT: Cost Model: Found costs of 7 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %V1024 = shufflevector <64 x half> %src1024, <64 x half> undef, <64 x i32> <i32 0, i32 62, i32 61, i32 60, i32 7, i32 58, i32 57, i32 55, i32 56, i32 54, i32 53, i32 52, i32 1, i32 50, i32 47, i32 48, i32 49, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 11, i32 37, i32 36, i32 33, i32 34, i32 35, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 23, i32 24, i32 20, i32 35, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 42, i32 12, i32 34, i32 10, i32 8, i32 9, i32 24, i32 6, i32 5, i32 4, i32 29, i32 2, i32 1, i32 0>
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
- %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
- %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 31, i32 30, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
- %V1024 = shufflevector <64 x half> %src1024, <64 x half> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 0, i32 1, i32 0>
+ %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
+ %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ %V1024 = shufflevector <64 x half> %src1024, <64 x half> undef, <64 x i32> <i32 0, i32 62, i32 61, i32 60, i32 7, i32 58, i32 57, i32 55, i32 56, i32 54, i32 53, i32 52, i32 1, i32 50, i32 47, i32 48, i32 49, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 11, i32 37, i32 36, i32 33, i32 34, i32 35, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 23, i32 24, i32 20, i32 35, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 42, i32 12, i32 34, i32 10, i32 8, i32 9, i32 24, i32 6, i32 5, i32 4, i32 29, i32 2, i32 1, i32 0>
ret void
}
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-single-src.ll b/llvm/test/Analysis/CostModel/X86/shuffle-single-src.ll
index 3727d7b..11f14df 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-single-src.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-single-src.ll
@@ -21,42 +21,42 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
; SSE-LABEL: 'test_vXf64'
; SSE-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
; SSE-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
-; SSE-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE-NEXT: Cost Model: Found costs of 8 for: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 5, i32 1, i32 0>
+; SSE-NEXT: Cost Model: Found costs of 8 for: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> <i32 15, i32 0, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 13, i32 3, i32 11, i32 1, i32 0>
; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'test_vXf64'
; XOP-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
; XOP-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
-; XOP-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
-; XOP-NEXT: Cost Model: Found costs of 8 for: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; XOP-NEXT: Cost Model: Found costs of 5 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 5, i32 1, i32 0>
+; XOP-NEXT: Cost Model: Found costs of 11 for: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> <i32 15, i32 0, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 13, i32 3, i32 11, i32 1, i32 0>
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'test_vXf64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
; AVX1-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 8 for: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of 5 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 5, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of 11 for: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> <i32 15, i32 0, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 13, i32 3, i32 11, i32 1, i32 0>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXf64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 4 for: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 5, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of 10 for: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> <i32 15, i32 0, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 13, i32 3, i32 11, i32 1, i32 0>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXf64'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 5, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> <i32 15, i32 0, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 13, i32 3, i32 11, i32 1, i32 0>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
%V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
- %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
- %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 5, i32 1, i32 0>
+ %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> <i32 15, i32 0, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 13, i32 3, i32 11, i32 1, i32 0>
ret void
}
@@ -64,79 +64,93 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
; SSE-LABEL: 'test_vXi64'
; SSE-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
; SSE-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
-; SSE-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 5, i32 1, i32 0>
; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'test_vXi64'
; XOP-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
; XOP-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
-; XOP-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
+; XOP-NEXT: Cost Model: Found costs of 5 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 5, i32 1, i32 0>
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'test_vXi64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
; AVX1-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of 5 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 5, i32 1, i32 0>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXi64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 5, i32 1, i32 0>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXi64'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 5, i32 1, i32 0>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
%V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
- %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
+ %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 5, i32 1, i32 0>
ret void
}
define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
-; SSE-LABEL: 'test_vXf32'
-; SSE-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
-; SSE-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
-; SSE-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; SSE2-LABEL: 'test_vXf32'
+; SSE2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+; SSE2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
+; SSE2-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE2-NEXT: Cost Model: Found costs of 11 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
+; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; SSSE3-LABEL: 'test_vXf32'
+; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of 11 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; SSE42-LABEL: 'test_vXf32'
+; SSE42-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+; SSE42-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of 10 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
+; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'test_vXf32'
; XOP-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
; XOP-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
; XOP-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
-; XOP-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; XOP-NEXT: Cost Model: Found costs of 12 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'test_vXf32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
; AVX1-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 6 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of 12 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXf32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of 9 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXf32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
%V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
%V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
- %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
ret void
}
@@ -144,48 +158,48 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
; SSE-LABEL: 'test_vXi32'
; SSE-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
; SSE-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
-; SSE-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
-; SSE-NEXT: Cost Model: Found costs of 6 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 13, i32 10, i32 9, i32 8, i32 8, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE-NEXT: Cost Model: Found costs of 10 for: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> <i32 31, i32 30, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 0, i32 1, i32 0>
+; SSE-NEXT: Cost Model: Found costs of 11 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
+; SSE-NEXT: Cost Model: Found costs of 18 for: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'test_vXi32'
; XOP-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
; XOP-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
-; XOP-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
-; XOP-NEXT: Cost Model: Found costs of 6 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 13, i32 10, i32 9, i32 8, i32 8, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; XOP-NEXT: Cost Model: Found costs of 12 for: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> <i32 31, i32 30, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; XOP-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 0, i32 1, i32 0>
+; XOP-NEXT: Cost Model: Found costs of 12 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
+; XOP-NEXT: Cost Model: Found costs of 26 for: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'test_vXi32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 8 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 13, i32 10, i32 9, i32 8, i32 8, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 14 for: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> <i32 31, i32 30, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 0, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of 12 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
+; AVX1-NEXT: Cost Model: Found costs of 26 for: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXi32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 13, i32 10, i32 9, i32 8, i32 8, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 8 for: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> <i32 31, i32 30, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 0, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of 9 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
+; AVX2-NEXT: Cost Model: Found costs of 20 for: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXi32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 13, i32 10, i32 9, i32 8, i32 8, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> <i32 31, i32 30, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 0, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
+; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
%V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
- %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
- %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 13, i32 10, i32 9, i32 8, i32 8, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
- %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> <i32 31, i32 30, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 0, i32 1, i32 0>
+ %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
+ %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
ret void
}
@@ -193,90 +207,90 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
; SSE2-LABEL: 'test_vXi16'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
; SSE2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 2, i32 2, i32 1, i32 1>
-; SSE2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:5 SizeLat:5 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:24 Lat:24 SizeLat:24 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE2-NEXT: Cost Model: Found costs of RThru:24 CodeSize:26 Lat:26 SizeLat:26 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:5 SizeLat:5 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 0, i32 1, i32 0>
+; SSE2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:24 Lat:24 SizeLat:24 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
+; SSE2-NEXT: Cost Model: Found costs of RThru:42 CodeSize:56 Lat:56 SizeLat:56 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE2-NEXT: Cost Model: Found costs of RThru:72 CodeSize:94 Lat:94 SizeLat:94 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> <i32 0, i32 62, i32 61, i32 60, i32 7, i32 58, i32 57, i32 55, i32 56, i32 54, i32 53, i32 52, i32 1, i32 50, i32 47, i32 48, i32 49, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 11, i32 37, i32 36, i32 33, i32 34, i32 35, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 23, i32 24, i32 20, i32 35, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 42, i32 12, i32 34, i32 10, i32 8, i32 9, i32 24, i32 6, i32 5, i32 4, i32 29, i32 2, i32 1, i32 0>
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSSE3-LABEL: 'test_vXi16'
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 2, i32 2, i32 1, i32 1>
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of 8 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of 8 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 0, i32 1, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of 9 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
+; SSSE3-NEXT: Cost Model: Found costs of 21 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of 39 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> <i32 0, i32 62, i32 61, i32 60, i32 7, i32 58, i32 57, i32 55, i32 56, i32 54, i32 53, i32 52, i32 1, i32 50, i32 47, i32 48, i32 49, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 11, i32 37, i32 36, i32 33, i32 34, i32 35, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 23, i32 24, i32 20, i32 35, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 42, i32 12, i32 34, i32 10, i32 8, i32 9, i32 24, i32 6, i32 5, i32 4, i32 29, i32 2, i32 1, i32 0>
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'test_vXi16'
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
; SSE42-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 2, i32 2, i32 1, i32 1>
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of 8 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of 8 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 0, i32 1, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of 9 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
+; SSE42-NEXT: Cost Model: Found costs of 21 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of 35 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> <i32 0, i32 62, i32 61, i32 60, i32 7, i32 58, i32 57, i32 55, i32 56, i32 54, i32 53, i32 52, i32 1, i32 50, i32 47, i32 48, i32 49, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 11, i32 37, i32 36, i32 33, i32 34, i32 35, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 23, i32 24, i32 20, i32 35, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 42, i32 12, i32 34, i32 10, i32 8, i32 9, i32 24, i32 6, i32 5, i32 4, i32 29, i32 2, i32 1, i32 0>
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'test_vXi16'
-; XOP-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+; XOP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
; XOP-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 2, i32 2, i32 1, i32 1>
-; XOP-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
-; XOP-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; XOP-NEXT: Cost Model: Found costs of 8 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; XOP-NEXT: Cost Model: Found costs of 16 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; XOP-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 0, i32 1, i32 0>
+; XOP-NEXT: Cost Model: Found costs of 9 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
+; XOP-NEXT: Cost Model: Found costs of 18 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; XOP-NEXT: Cost Model: Found costs of 63 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> <i32 0, i32 62, i32 61, i32 60, i32 7, i32 58, i32 57, i32 55, i32 56, i32 54, i32 53, i32 52, i32 1, i32 50, i32 47, i32 48, i32 49, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 11, i32 37, i32 36, i32 33, i32 34, i32 35, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 23, i32 24, i32 20, i32 35, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 42, i32 12, i32 34, i32 10, i32 8, i32 9, i32 24, i32 6, i32 5, i32 4, i32 29, i32 2, i32 1, i32 0>
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'test_vXi16'
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 2, i32 2, i32 1, i32 1>
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 8 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 16 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 20 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 0, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of 15 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
+; AVX1-NEXT: Cost Model: Found costs of 30 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of 99 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> <i32 0, i32 62, i32 61, i32 60, i32 7, i32 58, i32 57, i32 55, i32 56, i32 54, i32 53, i32 52, i32 1, i32 50, i32 47, i32 48, i32 49, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 11, i32 37, i32 36, i32 33, i32 34, i32 35, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 23, i32 24, i32 20, i32 35, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 42, i32 12, i32 34, i32 10, i32 8, i32 9, i32 24, i32 6, i32 5, i32 4, i32 29, i32 2, i32 1, i32 0>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXi16'
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 2, i32 2, i32 1, i32 1>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 8 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 10 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 0, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of 7 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
+; AVX2-NEXT: Cost Model: Found costs of 14 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of 44 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> <i32 0, i32 62, i32 61, i32 60, i32 7, i32 58, i32 57, i32 55, i32 56, i32 54, i32 53, i32 52, i32 1, i32 50, i32 47, i32 48, i32 49, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 11, i32 37, i32 36, i32 33, i32 34, i32 35, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 23, i32 24, i32 20, i32 35, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 42, i32 12, i32 34, i32 10, i32 8, i32 9, i32 24, i32 6, i32 5, i32 4, i32 29, i32 2, i32 1, i32 0>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'test_vXi16'
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 2, i32 2, i32 1, i32 1>
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512F-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512F-NEXT: Cost Model: Found costs of 14 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512F-NEXT: Cost Model: Found costs of 21 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512F-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 0, i32 1, i32 0>
+; AVX512F-NEXT: Cost Model: Found costs of 7 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
+; AVX512F-NEXT: Cost Model: Found costs of 14 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512F-NEXT: Cost Model: Found costs of 84 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> <i32 0, i32 62, i32 61, i32 60, i32 7, i32 58, i32 57, i32 55, i32 56, i32 54, i32 53, i32 52, i32 1, i32 50, i32 47, i32 48, i32 49, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 11, i32 37, i32 36, i32 33, i32 34, i32 35, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 23, i32 24, i32 20, i32 35, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 42, i32 12, i32 34, i32 10, i32 8, i32 9, i32 24, i32 6, i32 5, i32 4, i32 29, i32 2, i32 1, i32 0>
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'test_vXi16'
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 2, i32 2, i32 1, i32 1>
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512BW-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512BW-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512BW-NEXT: Cost Model: Found costs of 4 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 0, i32 1, i32 0>
+; AVX512BW-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
+; AVX512BW-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512BW-NEXT: Cost Model: Found costs of 4 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> <i32 0, i32 62, i32 61, i32 60, i32 7, i32 58, i32 57, i32 55, i32 56, i32 54, i32 53, i32 52, i32 1, i32 50, i32 47, i32 48, i32 49, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 11, i32 37, i32 36, i32 33, i32 34, i32 35, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 23, i32 24, i32 20, i32 35, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 42, i32 12, i32 34, i32 10, i32 8, i32 9, i32 24, i32 6, i32 5, i32 4, i32 29, i32 2, i32 1, i32 0>
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI-LABEL: 'test_vXi16'
-; AVX512VBMI-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
; AVX512VBMI-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 2, i32 2, i32 1, i32 1>
-; AVX512VBMI-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512VBMI-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512VBMI-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512VBMI-NEXT: Cost Model: Found costs of 4 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512VBMI-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 0, i32 1, i32 0>
+; AVX512VBMI-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
+; AVX512VBMI-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512VBMI-NEXT: Cost Model: Found costs of 4 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> <i32 0, i32 62, i32 61, i32 60, i32 7, i32 58, i32 57, i32 55, i32 56, i32 54, i32 53, i32 52, i32 1, i32 50, i32 47, i32 48, i32 49, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 11, i32 37, i32 36, i32 33, i32 34, i32 35, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 23, i32 24, i32 20, i32 35, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 42, i32 12, i32 34, i32 10, i32 8, i32 9, i32 24, i32 6, i32 5, i32 4, i32 29, i32 2, i32 1, i32 0>
; AVX512VBMI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
%V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 2, i32 2, i32 1, i32 1>
- %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
- %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
- %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
- %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 0, i32 1, i32 0>
+ %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 4, i32 15, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 3, i32 6, i32 5, i32 14, i32 7, i32 2, i32 1, i32 16>
+ %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> <i32 0, i32 62, i32 61, i32 60, i32 7, i32 58, i32 57, i32 55, i32 56, i32 54, i32 53, i32 52, i32 1, i32 50, i32 47, i32 48, i32 49, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 11, i32 37, i32 36, i32 33, i32 34, i32 35, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 23, i32 24, i32 20, i32 35, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 42, i32 12, i32 34, i32 10, i32 8, i32 9, i32 24, i32 6, i32 5, i32 4, i32 29, i32 2, i32 1, i32 0>
ret void
}
@@ -286,88 +300,88 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
; SSE2-NEXT: Cost Model: Found costs of 3 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 1>
; SSE2-NEXT: Cost Model: Found costs of 5 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 7, i32 5, i32 5, i32 3, i32 3, i32 1, i32 1>
; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:19 Lat:19 SizeLat:19 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE2-NEXT: Cost Model: Found costs of RThru:35 CodeSize:37 Lat:37 SizeLat:37 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:26 Lat:26 SizeLat:26 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE2-NEXT: Cost Model: Found costs of RThru:77 CodeSize:91 Lat:91 SizeLat:91 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 0, i32 62, i32 61, i32 60, i32 7, i32 58, i32 57, i32 55, i32 56, i32 54, i32 53, i32 52, i32 1, i32 50, i32 47, i32 48, i32 49, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 11, i32 37, i32 36, i32 33, i32 34, i32 35, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 23, i32 24, i32 20, i32 35, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 42, i32 12, i32 34, i32 10, i32 8, i32 9, i32 24, i32 6, i32 5, i32 4, i32 29, i32 2, i32 1, i32 0>
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSSE3-LABEL: 'test_vXi8'
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 1>
; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 7, i32 5, i32 5, i32 3, i32 3, i32 1, i32 1>
; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of 6 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of 21 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 0, i32 62, i32 61, i32 60, i32 7, i32 58, i32 57, i32 55, i32 56, i32 54, i32 53, i32 52, i32 1, i32 50, i32 47, i32 48, i32 49, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 11, i32 37, i32 36, i32 33, i32 34, i32 35, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 23, i32 24, i32 20, i32 35, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 42, i32 12, i32 34, i32 10, i32 8, i32 9, i32 24, i32 6, i32 5, i32 4, i32 29, i32 2, i32 1, i32 0>
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'test_vXi8'
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
; SSE42-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 1>
; SSE42-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 7, i32 5, i32 5, i32 3, i32 3, i32 1, i32 1>
; SSE42-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of 6 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of 21 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 0, i32 62, i32 61, i32 60, i32 7, i32 58, i32 57, i32 55, i32 56, i32 54, i32 53, i32 52, i32 1, i32 50, i32 47, i32 48, i32 49, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 11, i32 37, i32 36, i32 33, i32 34, i32 35, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 23, i32 24, i32 20, i32 35, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 42, i32 12, i32 34, i32 10, i32 8, i32 9, i32 24, i32 6, i32 5, i32 4, i32 29, i32 2, i32 1, i32 0>
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'test_vXi8'
-; XOP-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; XOP-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
; XOP-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 1>
; XOP-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 7, i32 5, i32 5, i32 3, i32 3, i32 1, i32 1>
; XOP-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; XOP-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 1, i32 0>
-; XOP-NEXT: Cost Model: Found costs of 8 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; XOP-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; XOP-NEXT: Cost Model: Found costs of 18 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 0, i32 62, i32 61, i32 60, i32 7, i32 58, i32 57, i32 55, i32 56, i32 54, i32 53, i32 52, i32 1, i32 50, i32 47, i32 48, i32 49, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 11, i32 37, i32 36, i32 33, i32 34, i32 35, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 23, i32 24, i32 20, i32 35, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 42, i32 12, i32 34, i32 10, i32 8, i32 9, i32 24, i32 6, i32 5, i32 4, i32 29, i32 2, i32 1, i32 0>
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'test_vXi8'
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 1>
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 7, i32 5, i32 5, i32 3, i32 3, i32 1, i32 1>
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 8 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 12 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of 8 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of 30 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 0, i32 62, i32 61, i32 60, i32 7, i32 58, i32 57, i32 55, i32 56, i32 54, i32 53, i32 52, i32 1, i32 50, i32 47, i32 48, i32 49, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 11, i32 37, i32 36, i32 33, i32 34, i32 35, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 23, i32 24, i32 20, i32 35, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 42, i32 12, i32 34, i32 10, i32 8, i32 9, i32 24, i32 6, i32 5, i32 4, i32 29, i32 2, i32 1, i32 0>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXi8'
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 1>
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 7, i32 5, i32 5, i32 3, i32 3, i32 1, i32 1>
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 6 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of 14 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 0, i32 62, i32 61, i32 60, i32 7, i32 58, i32 57, i32 55, i32 56, i32 54, i32 53, i32 52, i32 1, i32 50, i32 47, i32 48, i32 49, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 11, i32 37, i32 36, i32 33, i32 34, i32 35, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 23, i32 24, i32 20, i32 35, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 42, i32 12, i32 34, i32 10, i32 8, i32 9, i32 24, i32 6, i32 5, i32 4, i32 29, i32 2, i32 1, i32 0>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'test_vXi8'
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 1>
; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 7, i32 5, i32 5, i32 3, i32 3, i32 1, i32 1>
; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512F-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512F-NEXT: Cost Model: Found costs of 14 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512F-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512F-NEXT: Cost Model: Found costs of 14 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 0, i32 62, i32 61, i32 60, i32 7, i32 58, i32 57, i32 55, i32 56, i32 54, i32 53, i32 52, i32 1, i32 50, i32 47, i32 48, i32 49, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 11, i32 37, i32 36, i32 33, i32 34, i32 35, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 23, i32 24, i32 20, i32 35, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 42, i32 12, i32 34, i32 10, i32 8, i32 9, i32 24, i32 6, i32 5, i32 4, i32 29, i32 2, i32 1, i32 0>
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'test_vXi8'
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 1>
; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 7, i32 5, i32 5, i32 3, i32 3, i32 1, i32 1>
; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512BW-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512BW-NEXT: Cost Model: Found costs of 8 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512BW-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512BW-NEXT: Cost Model: Found costs of 8 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 0, i32 62, i32 61, i32 60, i32 7, i32 58, i32 57, i32 55, i32 56, i32 54, i32 53, i32 52, i32 1, i32 50, i32 47, i32 48, i32 49, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 11, i32 37, i32 36, i32 33, i32 34, i32 35, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 23, i32 24, i32 20, i32 35, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 42, i32 12, i32 34, i32 10, i32 8, i32 9, i32 24, i32 6, i32 5, i32 4, i32 29, i32 2, i32 1, i32 0>
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI-LABEL: 'test_vXi8'
-; AVX512VBMI-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; AVX512VBMI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
; AVX512VBMI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 1>
; AVX512VBMI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 7, i32 5, i32 5, i32 3, i32 3, i32 1, i32 1>
; AVX512VBMI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512VBMI-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512VBMI-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512VBMI-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512VBMI-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 0, i32 62, i32 61, i32 60, i32 7, i32 58, i32 57, i32 55, i32 56, i32 54, i32 53, i32 52, i32 1, i32 50, i32 47, i32 48, i32 49, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 11, i32 37, i32 36, i32 33, i32 34, i32 35, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 23, i32 24, i32 20, i32 35, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 42, i32 12, i32 34, i32 10, i32 8, i32 9, i32 24, i32 6, i32 5, i32 4, i32 29, i32 2, i32 1, i32 0>
; AVX512VBMI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
%V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 1>
%V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 7, i32 7, i32 5, i32 5, i32 3, i32 3, i32 1, i32 1>
%V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
- %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 1, i32 0>
- %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 8, i32 30, i32 20, i32 28, i32 27, i32 15, i32 13, i32 24, i32 11, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 26, i32 15, i32 14, i32 25, i32 12, i32 11, i32 23, i32 9, i32 31, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 0, i32 62, i32 61, i32 60, i32 7, i32 58, i32 57, i32 55, i32 56, i32 54, i32 53, i32 52, i32 1, i32 50, i32 47, i32 48, i32 49, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 11, i32 37, i32 36, i32 33, i32 34, i32 35, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 23, i32 24, i32 20, i32 35, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 42, i32 12, i32 34, i32 10, i32 8, i32 9, i32 24, i32 6, i32 5, i32 4, i32 29, i32 2, i32 1, i32 0>
ret void
}
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-splat.ll b/llvm/test/Analysis/CostModel/X86/shuffle-splat.ll
index 6d22546..c98efcd 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-splat.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-splat.ll
@@ -25,20 +25,20 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
;
; AVX1-LABEL: 'test_vXf64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXf64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXf64'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
@@ -56,20 +56,20 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
;
; AVX1-LABEL: 'test_vXi64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXi64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXi64'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
@@ -89,22 +89,22 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
; AVX1-LABEL: 'test_vXf32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXf32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXf32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 1>
@@ -125,22 +125,22 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
; AVX1-LABEL: 'test_vXi32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXi32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXi32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -154,41 +154,41 @@ define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128
; SSE2-LABEL: 'test_vXf16'
; SSE2-NEXT: Cost Model: Found costs of 0 for: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
; SSE2-NEXT: Cost Model: Found costs of 0 for: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSSE3-LABEL: 'test_vXf16'
; SSSE3-NEXT: Cost Model: Found costs of 0 for: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
; SSSE3-NEXT: Cost Model: Found costs of 0 for: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'test_vXf16'
; SSE42-NEXT: Cost Model: Found costs of 0 for: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
; SSE42-NEXT: Cost Model: Found costs of 0 for: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXf16'
; AVX2-NEXT: Cost Model: Found costs of 0 for: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
; AVX2-NEXT: Cost Model: Found costs of 0 for: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXf16'
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 1>
@@ -244,49 +244,49 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
; SSE2-LABEL: 'test_vXi16'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
; SSE2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSSE3-LABEL: 'test_vXi16'
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'test_vXi16'
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'test_vXi16'
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT: Cost Model: Found costs of 3 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT: Cost Model: Found costs of 3 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXi16'
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXi16'
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V32 = shufflevector <2 x i16> %src32, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
@@ -302,54 +302,54 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
; SSE2-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
; SSE2-NEXT: Cost Model: Found costs of 2 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
; SSE2-NEXT: Cost Model: Found costs of 2 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSSE3-LABEL: 'test_vXi8'
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'test_vXi8'
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'test_vXi8'
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXi8'
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXi8'
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V16 = shufflevector <2 x i8> %src16, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
@@ -365,46 +365,46 @@ define void @test_vXi1(<2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1>
; SSE2-LABEL: 'test_vXi1'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
; SSE2-NEXT: Cost Model: Found costs of 1 for: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSSE3-LABEL: 'test_vXi1'
; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSSE3-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSSE3-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'test_vXi1'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
; SSE42-NEXT: Cost Model: Found costs of 1 for: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'test_vXi1'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXi1'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V2 = shufflevector <2 x i1> %src2, <2 x i1> undef, <2 x i32> <i32 1, i32 1>
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V4 = shufflevector <4 x i1> %src4, <4 x i1> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8 = shufflevector <8 x i1> %src8, <8 x i1> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V16 = shufflevector <16 x i1> %src16, <16 x i1> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V32 = shufflevector <32 x i1> %src32, <32 x i1> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V64 = shufflevector <64 x i1> %src64, <64 x i1> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXi1'
@@ -440,22 +440,22 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
; AVX1-LABEL: 'test_upper_vXf32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_upper_vXf32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_upper_vXf32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-two-src.ll b/llvm/test/Analysis/CostModel/X86/shuffle-two-src.ll
index 56afc01..3016316 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-two-src.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-two-src.ll
@@ -22,41 +22,41 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
; SSE-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 3, i32 0>
; SSE-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 3, i32 3, i32 7, i32 6>
; SSE-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 7, i32 6, i32 12, i32 4, i32 3, i32 2, i32 1, i32 15>
-; SSE-NEXT: Cost Model: Found costs of 8 for: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 12, i32 13, i32 10, i32 18, i32 8, i32 8, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT: Cost Model: Found costs of 8 for: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 28, i32 13, i32 10, i32 18, i32 8, i32 8, i32 24, i32 31, i32 4, i32 9, i32 2, i32 8, i32 0>
; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'test_vXf64'
; XOP-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 3, i32 0>
; XOP-NEXT: Cost Model: Found costs of 3 for: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 3, i32 3, i32 7, i32 6>
; XOP-NEXT: Cost Model: Found costs of 6 for: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 7, i32 6, i32 12, i32 4, i32 3, i32 2, i32 1, i32 15>
-; XOP-NEXT: Cost Model: Found costs of 12 for: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 12, i32 13, i32 10, i32 18, i32 8, i32 8, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; XOP-NEXT: Cost Model: Found costs of 16 for: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 28, i32 13, i32 10, i32 18, i32 8, i32 8, i32 24, i32 31, i32 4, i32 9, i32 2, i32 8, i32 0>
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'test_vXf64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 3, i32 0>
; AVX1-NEXT: Cost Model: Found costs of 3 for: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 3, i32 3, i32 7, i32 6>
; AVX1-NEXT: Cost Model: Found costs of 6 for: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 7, i32 6, i32 12, i32 4, i32 3, i32 2, i32 1, i32 15>
-; AVX1-NEXT: Cost Model: Found costs of 12 for: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 12, i32 13, i32 10, i32 18, i32 8, i32 8, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of 16 for: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 28, i32 13, i32 10, i32 18, i32 8, i32 8, i32 24, i32 31, i32 4, i32 9, i32 2, i32 8, i32 0>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXf64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 3, i32 0>
; AVX2-NEXT: Cost Model: Found costs of 3 for: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 3, i32 3, i32 7, i32 6>
; AVX2-NEXT: Cost Model: Found costs of 6 for: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 7, i32 6, i32 12, i32 4, i32 3, i32 2, i32 1, i32 15>
-; AVX2-NEXT: Cost Model: Found costs of 11 for: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 12, i32 13, i32 10, i32 18, i32 8, i32 8, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of 16 for: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 28, i32 13, i32 10, i32 18, i32 8, i32 8, i32 24, i32 31, i32 4, i32 9, i32 2, i32 8, i32 0>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXf64'
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 3, i32 0>
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 3, i32 3, i32 7, i32 6>
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 7, i32 6, i32 12, i32 4, i32 3, i32 2, i32 1, i32 15>
-; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:9 SizeLat:3 for: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 12, i32 13, i32 10, i32 18, i32 8, i32 8, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:12 SizeLat:4 for: %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 28, i32 13, i32 10, i32 18, i32 8, i32 8, i32 24, i32 31, i32 4, i32 9, i32 2, i32 8, i32 0>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V128 = shufflevector <2 x double> %src128, <2 x double> %src128_1, <2 x i32> <i32 3, i32 0>
%V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 3, i32 3, i32 7, i32 6>
%V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 7, i32 6, i32 12, i32 4, i32 3, i32 2, i32 1, i32 15>
- %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 12, i32 13, i32 10, i32 18, i32 8, i32 8, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 28, i32 13, i32 10, i32 18, i32 8, i32 8, i32 24, i32 31, i32 4, i32 9, i32 2, i32 8, i32 0>
ret void
}
@@ -65,41 +65,41 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512,
; SSE-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 3, i32 0>
; SSE-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 3, i32 3, i32 7, i32 6>
; SSE-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 7, i32 6, i32 12, i32 4, i32 3, i32 2, i32 1, i32 15>
-; SSE-NEXT: Cost Model: Found costs of 8 for: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 12, i32 13, i32 10, i32 18, i32 8, i32 8, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT: Cost Model: Found costs of 8 for: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 28, i32 13, i32 10, i32 18, i32 8, i32 8, i32 24, i32 31, i32 4, i32 9, i32 2, i32 8, i32 0>
; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'test_vXi64'
; XOP-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 3, i32 0>
; XOP-NEXT: Cost Model: Found costs of 3 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 3, i32 3, i32 7, i32 6>
; XOP-NEXT: Cost Model: Found costs of 6 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 7, i32 6, i32 12, i32 4, i32 3, i32 2, i32 1, i32 15>
-; XOP-NEXT: Cost Model: Found costs of 12 for: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 12, i32 13, i32 10, i32 18, i32 8, i32 8, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; XOP-NEXT: Cost Model: Found costs of 16 for: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 28, i32 13, i32 10, i32 18, i32 8, i32 8, i32 24, i32 31, i32 4, i32 9, i32 2, i32 8, i32 0>
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'test_vXi64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 3, i32 0>
; AVX1-NEXT: Cost Model: Found costs of 3 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 3, i32 3, i32 7, i32 6>
; AVX1-NEXT: Cost Model: Found costs of 6 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 7, i32 6, i32 12, i32 4, i32 3, i32 2, i32 1, i32 15>
-; AVX1-NEXT: Cost Model: Found costs of 12 for: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 12, i32 13, i32 10, i32 18, i32 8, i32 8, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of 16 for: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 28, i32 13, i32 10, i32 18, i32 8, i32 8, i32 24, i32 31, i32 4, i32 9, i32 2, i32 8, i32 0>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXi64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 3, i32 0>
; AVX2-NEXT: Cost Model: Found costs of 3 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 3, i32 3, i32 7, i32 6>
; AVX2-NEXT: Cost Model: Found costs of 6 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 7, i32 6, i32 12, i32 4, i32 3, i32 2, i32 1, i32 15>
-; AVX2-NEXT: Cost Model: Found costs of 11 for: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 12, i32 13, i32 10, i32 18, i32 8, i32 8, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of 16 for: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 28, i32 13, i32 10, i32 18, i32 8, i32 8, i32 24, i32 31, i32 4, i32 9, i32 2, i32 8, i32 0>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXi64'
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 3, i32 0>
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 3, i32 3, i32 7, i32 6>
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 7, i32 6, i32 12, i32 4, i32 3, i32 2, i32 1, i32 15>
-; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:9 SizeLat:3 for: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 12, i32 13, i32 10, i32 18, i32 8, i32 8, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:12 SizeLat:4 for: %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 28, i32 13, i32 10, i32 18, i32 8, i32 8, i32 24, i32 31, i32 4, i32 9, i32 2, i32 8, i32 0>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V128 = shufflevector <2 x i64> %src128, <2 x i64> %src128_1, <2 x i32> <i32 3, i32 0>
%V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32> <i32 3, i32 3, i32 7, i32 6>
%V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32> <i32 7, i32 6, i32 12, i32 4, i32 3, i32 2, i32 1, i32 15>
- %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 12, i32 13, i32 10, i32 18, i32 8, i32 8, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 28, i32 13, i32 10, i32 18, i32 8, i32 8, i32 24, i32 31, i32 4, i32 9, i32 2, i32 8, i32 0>
ret void
}
@@ -107,129 +107,113 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
; SSE2-LABEL: 'test_vXf32'
; SSE2-NEXT: Cost Model: Found costs of 2 for: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 3, i32 0>
; SSE2-NEXT: Cost Model: Found costs of 2 for: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
-; SSE2-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; SSE2-NEXT: Cost Model: Found costs of 8 for: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE2-NEXT: Cost Model: Found costs of 14 for: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE2-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 11, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
+; SSE2-NEXT: Cost Model: Found costs of 10 for: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 29, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 22, i32 2, i32 1, i32 0>
+; SSE2-NEXT: Cost Model: Found costs of 22 for: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 20, i32 53, i32 22, i32 20, i32 19, i32 18, i32 40, i32 16, i32 15, i32 48, i32 13, i32 40, i32 11, i32 11, i32 9, i32 45, i32 33, i32 11, i32 5, i32 4, i32 35, i32 2, i32 33, i32 0>
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSSE3-LABEL: 'test_vXf32'
; SSSE3-NEXT: Cost Model: Found costs of 2 for: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 3, i32 0>
; SSSE3-NEXT: Cost Model: Found costs of 2 for: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
-; SSSE3-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of 8 for: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of 14 for: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 11, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of 10 for: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 29, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 22, i32 2, i32 1, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of 22 for: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 20, i32 53, i32 22, i32 20, i32 19, i32 18, i32 40, i32 16, i32 15, i32 48, i32 13, i32 40, i32 11, i32 11, i32 9, i32 45, i32 33, i32 11, i32 5, i32 4, i32 35, i32 2, i32 33, i32 0>
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'test_vXf32'
; SSE42-NEXT: Cost Model: Found costs of 2 for: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 3, i32 0>
; SSE42-NEXT: Cost Model: Found costs of 2 for: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
-; SSE42-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of 7 for: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of 13 for: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 11, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of 9 for: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 29, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 22, i32 2, i32 1, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of 20 for: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 20, i32 53, i32 22, i32 20, i32 19, i32 18, i32 40, i32 16, i32 15, i32 48, i32 13, i32 40, i32 11, i32 11, i32 9, i32 45, i32 33, i32 11, i32 5, i32 4, i32 35, i32 2, i32 33, i32 0>
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'test_vXf32'
; XOP-NEXT: Cost Model: Found costs of 2 for: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 3, i32 0>
; XOP-NEXT: Cost Model: Found costs of 2 for: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
-; XOP-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; XOP-NEXT: Cost Model: Found costs of 8 for: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; XOP-NEXT: Cost Model: Found costs of 17 for: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; XOP-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 11, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
+; XOP-NEXT: Cost Model: Found costs of 10 for: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 29, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 22, i32 2, i32 1, i32 0>
+; XOP-NEXT: Cost Model: Found costs of 27 for: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 20, i32 53, i32 22, i32 20, i32 19, i32 18, i32 40, i32 16, i32 15, i32 48, i32 13, i32 40, i32 11, i32 11, i32 9, i32 45, i32 33, i32 11, i32 5, i32 4, i32 35, i32 2, i32 33, i32 0>
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'test_vXf32'
; AVX1-NEXT: Cost Model: Found costs of 2 for: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 3, i32 0>
; AVX1-NEXT: Cost Model: Found costs of 2 for: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
-; AVX1-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 8 for: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 17 for: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 11, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of 10 for: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 29, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 22, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of 27 for: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 20, i32 53, i32 22, i32 20, i32 19, i32 18, i32 40, i32 16, i32 15, i32 48, i32 13, i32 40, i32 11, i32 11, i32 9, i32 45, i32 33, i32 11, i32 5, i32 4, i32 35, i32 2, i32 33, i32 0>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXf32'
; AVX2-NEXT: Cost Model: Found costs of 2 for: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 3, i32 0>
; AVX2-NEXT: Cost Model: Found costs of 2 for: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
-; AVX2-NEXT: Cost Model: Found costs of 3 for: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 6 for: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 13 for: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of 3 for: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 11, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of 8 for: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 29, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 22, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of 21 for: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 20, i32 53, i32 22, i32 20, i32 19, i32 18, i32 40, i32 16, i32 15, i32 48, i32 13, i32 40, i32 11, i32 11, i32 9, i32 45, i32 33, i32 11, i32 5, i32 4, i32 35, i32 2, i32 33, i32 0>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXf32'
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 3, i32 0>
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
-; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:9 SizeLat:3 for: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 11, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 29, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 22, i32 2, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:12 SizeLat:4 for: %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 20, i32 53, i32 22, i32 20, i32 19, i32 18, i32 40, i32 16, i32 15, i32 48, i32 13, i32 40, i32 11, i32 11, i32 9, i32 45, i32 33, i32 11, i32 5, i32 4, i32 35, i32 2, i32 33, i32 0>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V64 = shufflevector <2 x float> %src64, <2 x float> %src64_1, <2 x i32> <i32 3, i32 0>
%V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
- %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
- %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
- %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 11, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
+ %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 29, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 22, i32 2, i32 1, i32 0>
+ %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 20, i32 53, i32 22, i32 20, i32 19, i32 18, i32 40, i32 16, i32 15, i32 48, i32 13, i32 40, i32 11, i32 11, i32 9, i32 45, i32 33, i32 11, i32 5, i32 4, i32 35, i32 2, i32 33, i32 0>
ret void
}
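For context: each "Cost Model: Found costs of RThru:A CodeSize:B Lat:C SizeLat:D" line reports all four TargetTransformInfo cost kinds at once (reciprocal throughput, code size, latency, and size-plus-latency), and the shorter "Found costs of N" form means all four kinds agree on N. Check lines in this style are normally produced by a RUN line of roughly the following shape (a sketch only, assuming this file follows the usual X86 CostModel test conventions; the literal flags and prefixes may differ):

; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" -cost-kind=all \
; RUN:   -disable-output -mattr=+sse4.2 2>&1 | FileCheck %s --check-prefixes=SSE42

and are then regenerated mechanically with llvm/utils/update_analyze_test_checks.py rather than edited by hand.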
define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512, <32 x i32> %src1024, <2 x i32> %src64_1, <4 x i32> %src128_1, <8 x i32> %src256_1, <16 x i32> %src512_1, <32 x i32> %src1024_1) {
-; SSE2-LABEL: 'test_vXi32'
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 3, i32 0>
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
-; SSE2-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; SSE2-NEXT: Cost Model: Found costs of 8 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE2-NEXT: Cost Model: Found costs of 14 for: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
-;
-; SSSE3-LABEL: 'test_vXi32'
-; SSSE3-NEXT: Cost Model: Found costs of 2 for: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 3, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of 2 for: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
-; SSSE3-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of 8 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of 14 for: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
-;
-; SSE42-LABEL: 'test_vXi32'
-; SSE42-NEXT: Cost Model: Found costs of 2 for: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 3, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of 2 for: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
-; SSE42-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of 8 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of 13 for: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; SSE-LABEL: 'test_vXi32'
+; SSE-NEXT: Cost Model: Found costs of 2 for: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 3, i32 0>
+; SSE-NEXT: Cost Model: Found costs of 2 for: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
+; SSE-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 11, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
+; SSE-NEXT: Cost Model: Found costs of 10 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 29, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 22, i32 2, i32 1, i32 0>
+; SSE-NEXT: Cost Model: Found costs of 23 for: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 20, i32 53, i32 22, i32 20, i32 19, i32 18, i32 40, i32 16, i32 15, i32 48, i32 13, i32 40, i32 11, i32 11, i32 9, i32 45, i32 33, i32 11, i32 5, i32 4, i32 35, i32 2, i32 33, i32 0>
+; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'test_vXi32'
; XOP-NEXT: Cost Model: Found costs of 2 for: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 3, i32 0>
; XOP-NEXT: Cost Model: Found costs of 2 for: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
-; XOP-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; XOP-NEXT: Cost Model: Found costs of 8 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; XOP-NEXT: Cost Model: Found costs of 17 for: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; XOP-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 11, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
+; XOP-NEXT: Cost Model: Found costs of 10 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 29, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 22, i32 2, i32 1, i32 0>
+; XOP-NEXT: Cost Model: Found costs of 27 for: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 20, i32 53, i32 22, i32 20, i32 19, i32 18, i32 40, i32 16, i32 15, i32 48, i32 13, i32 40, i32 11, i32 11, i32 9, i32 45, i32 33, i32 11, i32 5, i32 4, i32 35, i32 2, i32 33, i32 0>
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'test_vXi32'
; AVX1-NEXT: Cost Model: Found costs of 2 for: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 3, i32 0>
; AVX1-NEXT: Cost Model: Found costs of 2 for: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
-; AVX1-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 8 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 17 for: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 11, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of 10 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 29, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 22, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of 27 for: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 20, i32 53, i32 22, i32 20, i32 19, i32 18, i32 40, i32 16, i32 15, i32 48, i32 13, i32 40, i32 11, i32 11, i32 9, i32 45, i32 33, i32 11, i32 5, i32 4, i32 35, i32 2, i32 33, i32 0>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXi32'
; AVX2-NEXT: Cost Model: Found costs of 2 for: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 3, i32 0>
; AVX2-NEXT: Cost Model: Found costs of 2 for: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
-; AVX2-NEXT: Cost Model: Found costs of 3 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 6 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 13 for: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of 3 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 11, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of 8 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 29, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 22, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of 21 for: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 20, i32 53, i32 22, i32 20, i32 19, i32 18, i32 40, i32 16, i32 15, i32 48, i32 13, i32 40, i32 11, i32 11, i32 9, i32 45, i32 33, i32 11, i32 5, i32 4, i32 35, i32 2, i32 33, i32 0>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXi32'
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 3, i32 0>
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
-; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:9 SizeLat:3 for: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 11, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 29, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 22, i32 2, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:12 SizeLat:4 for: %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 20, i32 53, i32 22, i32 20, i32 19, i32 18, i32 40, i32 16, i32 15, i32 48, i32 13, i32 40, i32 11, i32 11, i32 9, i32 45, i32 33, i32 11, i32 5, i32 4, i32 35, i32 2, i32 33, i32 0>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V64 = shufflevector <2 x i32> %src64, <2 x i32> %src64_1, <2 x i32> <i32 3, i32 0>
%V128 = shufflevector <4 x i32> %src128, <4 x i32> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
- %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
- %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
- %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32> <i32 11, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0>
+ %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 29, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 22, i32 2, i32 1, i32 0>
+ %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 20, i32 53, i32 22, i32 20, i32 19, i32 18, i32 40, i32 16, i32 15, i32 48, i32 13, i32 40, i32 11, i32 11, i32 9, i32 45, i32 33, i32 11, i32 5, i32 4, i32 35, i32 2, i32 33, i32 0>
ret void
}
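Note that the test_vXi32 hunk above also folds the previously separate SSE2, SSSE3, and SSE42 check blocks into one shared SSE block: with the updated shuffle masks the reported costs become identical on all three subtargets, so the update script emits the common prefix instead of three copies. That folding presupposes RUN lines that share a prefix list, roughly of this shape (again a sketch, not this file's literal RUN lines):

; RUN: ... -mattr=+sse2   ... | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: ... -mattr=+ssse3  ... | FileCheck %s --check-prefixes=SSE,SSSE3
; RUN: ... -mattr=+sse4.2 ... | FileCheck %s --check-prefixes=SSE,SSE42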
@@ -238,89 +222,89 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <
; SSE2-NEXT: Cost Model: Found costs of 2 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 3, i32 0>
; SSE2-NEXT: Cost Model: Found costs of 2 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
; SSE2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 7, i32 6, i32 6, i32 8, i32 9, i32 2, i32 1, i32 0>
-; SSE2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:11 SizeLat:11 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 15, i32 14, i32 13, i32 20, i32 21, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE2-NEXT: Cost Model: Found costs of RThru:21 CodeSize:27 Lat:27 SizeLat:27 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 31, i32 30, i32 45, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 38, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE2-NEXT: Cost Model: Found costs of RThru:30 CodeSize:36 Lat:36 SizeLat:36 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 63, i32 62, i32 71, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 66, i32 2, i32 1, i32 0>
+; SSE2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:24 Lat:24 SizeLat:24 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 29, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 22, i32 2, i32 1, i32 0>
+; SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:51 Lat:51 SizeLat:51 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 7, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 40, i32 55, i32 11, i32 9, i32 45, i32 4, i32 11, i32 4, i32 5, i32 35, i32 2, i32 33, i32 0>
+; SSE2-NEXT: Cost Model: Found costs of RThru:69 CodeSize:91 Lat:91 SizeLat:91 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 124, i32 62, i32 71, i32 127, i32 58, i32 60, i32 55, i32 56, i32 57, i32 54, i32 127, i32 52, i32 50, i32 51, i32 49, i32 48, i32 47, i32 45, i32 44, i32 44, i32 100, i32 41, i32 40, i32 39, i32 39, i32 42, i32 37, i32 38, i32 35, i32 34, i32 32, i32 33, i32 31, i32 27, i32 30, i32 28, i32 29, i32 26, i32 25, i32 24, i32 23, i32 99, i32 21, i32 20, i32 19, i32 18, i32 17, i32 72, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 0, i32 1, i32 5, i32 6, i32 2, i32 66, i32 2, i32 6, i32 7>
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSSE3-LABEL: 'test_vXi16'
; SSSE3-NEXT: Cost Model: Found costs of 3 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 3, i32 0>
; SSSE3-NEXT: Cost Model: Found costs of 3 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
; SSSE3-NEXT: Cost Model: Found costs of 3 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 7, i32 6, i32 6, i32 8, i32 9, i32 2, i32 1, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 15, i32 14, i32 13, i32 20, i32 21, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of 10 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 31, i32 30, i32 45, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 38, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of 12 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 63, i32 62, i32 71, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 66, i32 2, i32 1, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of 9 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 29, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 22, i32 2, i32 1, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of 21 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 7, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 40, i32 55, i32 11, i32 9, i32 45, i32 4, i32 11, i32 4, i32 5, i32 35, i32 2, i32 33, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of 37 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 124, i32 62, i32 71, i32 127, i32 58, i32 60, i32 55, i32 56, i32 57, i32 54, i32 127, i32 52, i32 50, i32 51, i32 49, i32 48, i32 47, i32 45, i32 44, i32 44, i32 100, i32 41, i32 40, i32 39, i32 39, i32 42, i32 37, i32 38, i32 35, i32 34, i32 32, i32 33, i32 31, i32 27, i32 30, i32 28, i32 29, i32 26, i32 25, i32 24, i32 23, i32 99, i32 21, i32 20, i32 19, i32 18, i32 17, i32 72, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 0, i32 1, i32 5, i32 6, i32 2, i32 66, i32 2, i32 6, i32 7>
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'test_vXi16'
; SSE42-NEXT: Cost Model: Found costs of 3 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 3, i32 0>
; SSE42-NEXT: Cost Model: Found costs of 3 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
; SSE42-NEXT: Cost Model: Found costs of 3 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 7, i32 6, i32 6, i32 8, i32 9, i32 2, i32 1, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 15, i32 14, i32 13, i32 20, i32 21, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of 10 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 31, i32 30, i32 45, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 38, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of 12 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 63, i32 62, i32 71, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 66, i32 2, i32 1, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of 9 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 29, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 22, i32 2, i32 1, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of 19 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 7, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 40, i32 55, i32 11, i32 9, i32 45, i32 4, i32 11, i32 4, i32 5, i32 35, i32 2, i32 33, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of 33 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 124, i32 62, i32 71, i32 127, i32 58, i32 60, i32 55, i32 56, i32 57, i32 54, i32 127, i32 52, i32 50, i32 51, i32 49, i32 48, i32 47, i32 45, i32 44, i32 44, i32 100, i32 41, i32 40, i32 39, i32 39, i32 42, i32 37, i32 38, i32 35, i32 34, i32 32, i32 33, i32 31, i32 27, i32 30, i32 28, i32 29, i32 26, i32 25, i32 24, i32 23, i32 99, i32 21, i32 20, i32 19, i32 18, i32 17, i32 72, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 0, i32 1, i32 5, i32 6, i32 2, i32 66, i32 2, i32 6, i32 7>
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'test_vXi16'
; XOP-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 3, i32 0>
; XOP-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
; XOP-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 7, i32 6, i32 6, i32 8, i32 9, i32 2, i32 1, i32 0>
-; XOP-NEXT: Cost Model: Found costs of 9 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 15, i32 14, i32 13, i32 20, i32 21, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; XOP-NEXT: Cost Model: Found costs of 18 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 31, i32 30, i32 45, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 38, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; XOP-NEXT: Cost Model: Found costs of 26 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 63, i32 62, i32 71, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 66, i32 2, i32 1, i32 0>
+; XOP-NEXT: Cost Model: Found costs of 9 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 29, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 22, i32 2, i32 1, i32 0>
+; XOP-NEXT: Cost Model: Found costs of 30 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 7, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 40, i32 55, i32 11, i32 9, i32 45, i32 4, i32 11, i32 4, i32 5, i32 35, i32 2, i32 33, i32 0>
+; XOP-NEXT: Cost Model: Found costs of 54 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 124, i32 62, i32 71, i32 127, i32 58, i32 60, i32 55, i32 56, i32 57, i32 54, i32 127, i32 52, i32 50, i32 51, i32 49, i32 48, i32 47, i32 45, i32 44, i32 44, i32 100, i32 41, i32 40, i32 39, i32 39, i32 42, i32 37, i32 38, i32 35, i32 34, i32 32, i32 33, i32 31, i32 27, i32 30, i32 28, i32 29, i32 26, i32 25, i32 24, i32 23, i32 99, i32 21, i32 20, i32 19, i32 18, i32 17, i32 72, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 0, i32 1, i32 5, i32 6, i32 2, i32 66, i32 2, i32 6, i32 7>
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'test_vXi16'
; AVX1-NEXT: Cost Model: Found costs of 3 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 3, i32 0>
; AVX1-NEXT: Cost Model: Found costs of 3 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
; AVX1-NEXT: Cost Model: Found costs of 3 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 7, i32 6, i32 6, i32 8, i32 9, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 15 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 15, i32 14, i32 13, i32 20, i32 21, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 30 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 31, i32 30, i32 45, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 38, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 42 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 63, i32 62, i32 71, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 66, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of 15 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 29, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 22, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of 42 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 7, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 40, i32 55, i32 11, i32 9, i32 45, i32 4, i32 11, i32 4, i32 5, i32 35, i32 2, i32 33, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of 90 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 124, i32 62, i32 71, i32 127, i32 58, i32 60, i32 55, i32 56, i32 57, i32 54, i32 127, i32 52, i32 50, i32 51, i32 49, i32 48, i32 47, i32 45, i32 44, i32 44, i32 100, i32 41, i32 40, i32 39, i32 39, i32 42, i32 37, i32 38, i32 35, i32 34, i32 32, i32 33, i32 31, i32 27, i32 30, i32 28, i32 29, i32 26, i32 25, i32 24, i32 23, i32 99, i32 21, i32 20, i32 19, i32 18, i32 17, i32 72, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 0, i32 1, i32 5, i32 6, i32 2, i32 66, i32 2, i32 6, i32 7>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXi16'
; AVX2-NEXT: Cost Model: Found costs of 3 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 3, i32 0>
; AVX2-NEXT: Cost Model: Found costs of 3 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
; AVX2-NEXT: Cost Model: Found costs of 3 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 7, i32 6, i32 6, i32 8, i32 9, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 7 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 15, i32 14, i32 13, i32 20, i32 21, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 14 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 31, i32 30, i32 45, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 38, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 20 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 63, i32 62, i32 71, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 66, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of 7 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 29, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 22, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of 17 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 7, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 40, i32 55, i32 11, i32 9, i32 45, i32 4, i32 11, i32 4, i32 5, i32 35, i32 2, i32 33, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of 42 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 124, i32 62, i32 71, i32 127, i32 58, i32 60, i32 55, i32 56, i32 57, i32 54, i32 127, i32 52, i32 50, i32 51, i32 49, i32 48, i32 47, i32 45, i32 44, i32 44, i32 100, i32 41, i32 40, i32 39, i32 39, i32 42, i32 37, i32 38, i32 35, i32 34, i32 32, i32 33, i32 31, i32 27, i32 30, i32 28, i32 29, i32 26, i32 25, i32 24, i32 23, i32 99, i32 21, i32 20, i32 19, i32 18, i32 17, i32 72, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 0, i32 1, i32 5, i32 6, i32 2, i32 66, i32 2, i32 6, i32 7>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'test_vXi16'
; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 3, i32 0>
; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 7, i32 6, i32 6, i32 8, i32 9, i32 2, i32 1, i32 0>
-; AVX512F-NEXT: Cost Model: Found costs of 7 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 15, i32 14, i32 13, i32 20, i32 21, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512F-NEXT: Cost Model: Found costs of 42 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 31, i32 30, i32 45, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 38, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512F-NEXT: Cost Model: Found costs of 84 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 63, i32 62, i32 71, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 66, i32 2, i32 1, i32 0>
+; AVX512F-NEXT: Cost Model: Found costs of 7 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 29, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 22, i32 2, i32 1, i32 0>
+; AVX512F-NEXT: Cost Model: Found costs of 42 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 7, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 40, i32 55, i32 11, i32 9, i32 45, i32 4, i32 11, i32 4, i32 5, i32 35, i32 2, i32 33, i32 0>
+; AVX512F-NEXT: Cost Model: Found costs of 168 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 124, i32 62, i32 71, i32 127, i32 58, i32 60, i32 55, i32 56, i32 57, i32 54, i32 127, i32 52, i32 50, i32 51, i32 49, i32 48, i32 47, i32 45, i32 44, i32 44, i32 100, i32 41, i32 40, i32 39, i32 39, i32 42, i32 37, i32 38, i32 35, i32 34, i32 32, i32 33, i32 31, i32 27, i32 30, i32 28, i32 29, i32 26, i32 25, i32 24, i32 23, i32 99, i32 21, i32 20, i32 19, i32 18, i32 17, i32 72, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 0, i32 1, i32 5, i32 6, i32 2, i32 66, i32 2, i32 6, i32 7>
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'test_vXi16'
; AVX512BW-NEXT: Cost Model: Found costs of 2 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 3, i32 0>
; AVX512BW-NEXT: Cost Model: Found costs of 2 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
; AVX512BW-NEXT: Cost Model: Found costs of 2 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 7, i32 6, i32 6, i32 8, i32 9, i32 2, i32 1, i32 0>
-; AVX512BW-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 15, i32 14, i32 13, i32 20, i32 21, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512BW-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 31, i32 30, i32 45, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 38, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512BW-NEXT: Cost Model: Found costs of 4 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 63, i32 62, i32 71, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 66, i32 2, i32 1, i32 0>
+; AVX512BW-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 29, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 22, i32 2, i32 1, i32 0>
+; AVX512BW-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 7, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 40, i32 55, i32 11, i32 9, i32 45, i32 4, i32 11, i32 4, i32 5, i32 35, i32 2, i32 33, i32 0>
+; AVX512BW-NEXT: Cost Model: Found costs of 8 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 124, i32 62, i32 71, i32 127, i32 58, i32 60, i32 55, i32 56, i32 57, i32 54, i32 127, i32 52, i32 50, i32 51, i32 49, i32 48, i32 47, i32 45, i32 44, i32 44, i32 100, i32 41, i32 40, i32 39, i32 39, i32 42, i32 37, i32 38, i32 35, i32 34, i32 32, i32 33, i32 31, i32 27, i32 30, i32 28, i32 29, i32 26, i32 25, i32 24, i32 23, i32 99, i32 21, i32 20, i32 19, i32 18, i32 17, i32 72, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 0, i32 1, i32 5, i32 6, i32 2, i32 66, i32 2, i32 6, i32 7>
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI-LABEL: 'test_vXi16'
; AVX512VBMI-NEXT: Cost Model: Found costs of 2 for: %V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 3, i32 0>
; AVX512VBMI-NEXT: Cost Model: Found costs of 2 for: %V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
; AVX512VBMI-NEXT: Cost Model: Found costs of 2 for: %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 7, i32 6, i32 6, i32 8, i32 9, i32 2, i32 1, i32 0>
-; AVX512VBMI-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 15, i32 14, i32 13, i32 20, i32 21, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512VBMI-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 31, i32 30, i32 45, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 38, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512VBMI-NEXT: Cost Model: Found costs of 4 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 63, i32 62, i32 71, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 66, i32 2, i32 1, i32 0>
+; AVX512VBMI-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 29, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 22, i32 2, i32 1, i32 0>
+; AVX512VBMI-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 7, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 40, i32 55, i32 11, i32 9, i32 45, i32 4, i32 11, i32 4, i32 5, i32 35, i32 2, i32 33, i32 0>
+; AVX512VBMI-NEXT: Cost Model: Found costs of 8 for: %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 124, i32 62, i32 71, i32 127, i32 58, i32 60, i32 55, i32 56, i32 57, i32 54, i32 127, i32 52, i32 50, i32 51, i32 49, i32 48, i32 47, i32 45, i32 44, i32 44, i32 100, i32 41, i32 40, i32 39, i32 39, i32 42, i32 37, i32 38, i32 35, i32 34, i32 32, i32 33, i32 31, i32 27, i32 30, i32 28, i32 29, i32 26, i32 25, i32 24, i32 23, i32 99, i32 21, i32 20, i32 19, i32 18, i32 17, i32 72, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 0, i32 1, i32 5, i32 6, i32 2, i32 66, i32 2, i32 6, i32 7>
; AVX512VBMI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V32 = shufflevector <2 x i16> %src32, <2 x i16> %src32_1, <2 x i32> <i32 3, i32 0>
%V64 = shufflevector <4 x i16> %src64, <4 x i16> %src64_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
%V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 7, i32 6, i32 6, i32 8, i32 9, i32 2, i32 1, i32 0>
- %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 15, i32 14, i32 13, i32 20, i32 21, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
- %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 31, i32 30, i32 45, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 38, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
- %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 63, i32 62, i32 71, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 66, i32 2, i32 1, i32 0>
+ %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 29, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 22, i32 2, i32 1, i32 0>
+ %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 7, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 40, i32 55, i32 11, i32 9, i32 45, i32 4, i32 11, i32 4, i32 5, i32 35, i32 2, i32 33, i32 0>
+ %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 124, i32 62, i32 71, i32 127, i32 58, i32 60, i32 55, i32 56, i32 57, i32 54, i32 127, i32 52, i32 50, i32 51, i32 49, i32 48, i32 47, i32 45, i32 44, i32 44, i32 100, i32 41, i32 40, i32 39, i32 39, i32 42, i32 37, i32 38, i32 35, i32 34, i32 32, i32 33, i32 31, i32 27, i32 30, i32 28, i32 29, i32 26, i32 25, i32 24, i32 23, i32 99, i32 21, i32 20, i32 19, i32 18, i32 17, i32 72, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 0, i32 1, i32 5, i32 6, i32 2, i32 66, i32 2, i32 6, i32 7>
ret void
}
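The wide spread in the test_vXi16 costs above tracks a real ISA difference: AVX512BW and AVX512VBMI provide word-granularity two-source cross-lane shuffles (vpermt2w/vpermi2w), so even the 512-bit case stays at cost 2 and the split 1024-bit case stays cheap (8), whereas plain AVX512F has no single-instruction lowering for word-element cross-lane shuffles and the model charges a far more expensive expansion (42 and 168). That is the usual reading of these numbers; the diff itself does not state the lowering.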
@@ -330,8 +314,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
; SSE2-NEXT: Cost Model: Found costs of 4 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
; SSE2-NEXT: Cost Model: Found costs of 7 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 7, i32 6, i32 6, i32 8, i32 9, i32 2, i32 1, i32 0>
; SSE2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:13 Lat:13 SizeLat:13 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 29, i32 14, i32 28, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE2-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:23 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 31, i32 30, i32 45, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:41 Lat:41 SizeLat:41 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 63, i32 100, i32 61, i32 96, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE2-NEXT: Cost Model: Found costs of RThru:36 CodeSize:42 Lat:42 SizeLat:42 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 7, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 40, i32 55, i32 11, i32 9, i32 45, i32 4, i32 11, i32 4, i32 5, i32 35, i32 2, i32 33, i32 0>
+; SSE2-NEXT: Cost Model: Found costs of RThru:66 CodeSize:78 Lat:78 SizeLat:78 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 124, i32 62, i32 71, i32 127, i32 58, i32 60, i32 55, i32 56, i32 57, i32 54, i32 127, i32 52, i32 50, i32 51, i32 49, i32 48, i32 47, i32 45, i32 44, i32 44, i32 100, i32 42, i32 41, i32 39, i32 39, i32 40, i32 37, i32 38, i32 35, i32 34, i32 32, i32 33, i32 31, i32 27, i32 30, i32 28, i32 29, i32 26, i32 25, i32 24, i32 23, i32 99, i32 21, i32 20, i32 19, i32 18, i32 17, i32 72, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 5, i32 6, i32 4, i32 66, i32 2, i32 1, i32 0>
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSSE3-LABEL: 'test_vXi8'
@@ -339,8 +323,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
; SSSE3-NEXT: Cost Model: Found costs of 3 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
; SSSE3-NEXT: Cost Model: Found costs of 3 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 7, i32 6, i32 6, i32 8, i32 9, i32 2, i32 1, i32 0>
; SSSE3-NEXT: Cost Model: Found costs of 3 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 29, i32 14, i32 28, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 31, i32 30, i32 45, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSSE3-NEXT: Cost Model: Found costs of 6 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 63, i32 100, i32 61, i32 96, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of 12 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 7, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 40, i32 55, i32 11, i32 9, i32 45, i32 4, i32 11, i32 4, i32 5, i32 35, i32 2, i32 33, i32 0>
+; SSSE3-NEXT: Cost Model: Found costs of 18 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 124, i32 62, i32 71, i32 127, i32 58, i32 60, i32 55, i32 56, i32 57, i32 54, i32 127, i32 52, i32 50, i32 51, i32 49, i32 48, i32 47, i32 45, i32 44, i32 44, i32 100, i32 42, i32 41, i32 39, i32 39, i32 40, i32 37, i32 38, i32 35, i32 34, i32 32, i32 33, i32 31, i32 27, i32 30, i32 28, i32 29, i32 26, i32 25, i32 24, i32 23, i32 99, i32 21, i32 20, i32 19, i32 18, i32 17, i32 72, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 5, i32 6, i32 4, i32 66, i32 2, i32 1, i32 0>
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'test_vXi8'
@@ -348,8 +332,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
; SSE42-NEXT: Cost Model: Found costs of 3 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
; SSE42-NEXT: Cost Model: Found costs of 3 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 7, i32 6, i32 6, i32 8, i32 9, i32 2, i32 1, i32 0>
; SSE42-NEXT: Cost Model: Found costs of 3 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 29, i32 14, i32 28, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of 4 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 31, i32 30, i32 45, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SSE42-NEXT: Cost Model: Found costs of 6 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 63, i32 100, i32 61, i32 96, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of 10 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 7, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 40, i32 55, i32 11, i32 9, i32 45, i32 4, i32 11, i32 4, i32 5, i32 35, i32 2, i32 33, i32 0>
+; SSE42-NEXT: Cost Model: Found costs of 18 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 124, i32 62, i32 71, i32 127, i32 58, i32 60, i32 55, i32 56, i32 57, i32 54, i32 127, i32 52, i32 50, i32 51, i32 49, i32 48, i32 47, i32 45, i32 44, i32 44, i32 100, i32 42, i32 41, i32 39, i32 39, i32 40, i32 37, i32 38, i32 35, i32 34, i32 32, i32 33, i32 31, i32 27, i32 30, i32 28, i32 29, i32 26, i32 25, i32 24, i32 23, i32 99, i32 21, i32 20, i32 19, i32 18, i32 17, i32 72, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 5, i32 6, i32 4, i32 66, i32 2, i32 1, i32 0>
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'test_vXi8'
@@ -357,8 +341,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
; XOP-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
; XOP-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 7, i32 6, i32 6, i32 8, i32 9, i32 2, i32 1, i32 0>
; XOP-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 29, i32 14, i32 28, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; XOP-NEXT: Cost Model: Found costs of 9 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 31, i32 30, i32 45, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 1, i32 0>
-; XOP-NEXT: Cost Model: Found costs of 13 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 63, i32 100, i32 61, i32 96, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; XOP-NEXT: Cost Model: Found costs of 9 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 7, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 40, i32 55, i32 11, i32 9, i32 45, i32 4, i32 11, i32 4, i32 5, i32 35, i32 2, i32 33, i32 0>
+; XOP-NEXT: Cost Model: Found costs of 36 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 124, i32 62, i32 71, i32 127, i32 58, i32 60, i32 55, i32 56, i32 57, i32 54, i32 127, i32 52, i32 50, i32 51, i32 49, i32 48, i32 47, i32 45, i32 44, i32 44, i32 100, i32 42, i32 41, i32 39, i32 39, i32 40, i32 37, i32 38, i32 35, i32 34, i32 32, i32 33, i32 31, i32 27, i32 30, i32 28, i32 29, i32 26, i32 25, i32 24, i32 23, i32 99, i32 21, i32 20, i32 19, i32 18, i32 17, i32 72, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 5, i32 6, i32 4, i32 66, i32 2, i32 1, i32 0>
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'test_vXi8'
@@ -366,8 +350,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
; AVX1-NEXT: Cost Model: Found costs of 3 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
; AVX1-NEXT: Cost Model: Found costs of 3 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 7, i32 6, i32 6, i32 8, i32 9, i32 2, i32 1, i32 0>
; AVX1-NEXT: Cost Model: Found costs of 3 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 29, i32 14, i32 28, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 15 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 31, i32 30, i32 45, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 23 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 63, i32 100, i32 61, i32 96, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of 15 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 7, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 40, i32 55, i32 11, i32 9, i32 45, i32 4, i32 11, i32 4, i32 5, i32 35, i32 2, i32 33, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of 54 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 124, i32 62, i32 71, i32 127, i32 58, i32 60, i32 55, i32 56, i32 57, i32 54, i32 127, i32 52, i32 50, i32 51, i32 49, i32 48, i32 47, i32 45, i32 44, i32 44, i32 100, i32 42, i32 41, i32 39, i32 39, i32 40, i32 37, i32 38, i32 35, i32 34, i32 32, i32 33, i32 31, i32 27, i32 30, i32 28, i32 29, i32 26, i32 25, i32 24, i32 23, i32 99, i32 21, i32 20, i32 19, i32 18, i32 17, i32 72, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 5, i32 6, i32 4, i32 66, i32 2, i32 1, i32 0>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXi8'
@@ -375,8 +359,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
; AVX2-NEXT: Cost Model: Found costs of 3 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
; AVX2-NEXT: Cost Model: Found costs of 3 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 7, i32 6, i32 6, i32 8, i32 9, i32 2, i32 1, i32 0>
; AVX2-NEXT: Cost Model: Found costs of 3 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 29, i32 14, i32 28, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 7 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 31, i32 30, i32 45, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 11 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 63, i32 100, i32 61, i32 96, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of 7 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 7, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 40, i32 55, i32 11, i32 9, i32 45, i32 4, i32 11, i32 4, i32 5, i32 35, i32 2, i32 33, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of 23 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 124, i32 62, i32 71, i32 127, i32 58, i32 60, i32 55, i32 56, i32 57, i32 54, i32 127, i32 52, i32 50, i32 51, i32 49, i32 48, i32 47, i32 45, i32 44, i32 44, i32 100, i32 42, i32 41, i32 39, i32 39, i32 40, i32 37, i32 38, i32 35, i32 34, i32 32, i32 33, i32 31, i32 27, i32 30, i32 28, i32 29, i32 26, i32 25, i32 24, i32 23, i32 99, i32 21, i32 20, i32 19, i32 18, i32 17, i32 72, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 5, i32 6, i32 4, i32 66, i32 2, i32 1, i32 0>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'test_vXi8'
@@ -384,8 +368,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 7, i32 6, i32 6, i32 8, i32 9, i32 2, i32 1, i32 0>
; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 29, i32 14, i32 28, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512F-NEXT: Cost Model: Found costs of 7 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 31, i32 30, i32 45, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512F-NEXT: Cost Model: Found costs of 42 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 63, i32 100, i32 61, i32 96, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512F-NEXT: Cost Model: Found costs of 7 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 7, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 40, i32 55, i32 11, i32 9, i32 45, i32 4, i32 11, i32 4, i32 5, i32 35, i32 2, i32 33, i32 0>
+; AVX512F-NEXT: Cost Model: Found costs of 42 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 124, i32 62, i32 71, i32 127, i32 58, i32 60, i32 55, i32 56, i32 57, i32 54, i32 127, i32 52, i32 50, i32 51, i32 49, i32 48, i32 47, i32 45, i32 44, i32 44, i32 100, i32 42, i32 41, i32 39, i32 39, i32 40, i32 37, i32 38, i32 35, i32 34, i32 32, i32 33, i32 31, i32 27, i32 30, i32 28, i32 29, i32 26, i32 25, i32 24, i32 23, i32 99, i32 21, i32 20, i32 19, i32 18, i32 17, i32 72, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 5, i32 6, i32 4, i32 66, i32 2, i32 1, i32 0>
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'test_vXi8'
@@ -393,8 +377,8 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 7, i32 6, i32 6, i32 8, i32 9, i32 2, i32 1, i32 0>
; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 29, i32 14, i32 28, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512BW-NEXT: Cost Model: Found costs of 7 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 31, i32 30, i32 45, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512BW-NEXT: Cost Model: Found costs of 19 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 63, i32 100, i32 61, i32 96, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512BW-NEXT: Cost Model: Found costs of 7 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 7, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 40, i32 55, i32 11, i32 9, i32 45, i32 4, i32 11, i32 4, i32 5, i32 35, i32 2, i32 33, i32 0>
+; AVX512BW-NEXT: Cost Model: Found costs of 19 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 124, i32 62, i32 71, i32 127, i32 58, i32 60, i32 55, i32 56, i32 57, i32 54, i32 127, i32 52, i32 50, i32 51, i32 49, i32 48, i32 47, i32 45, i32 44, i32 44, i32 100, i32 42, i32 41, i32 39, i32 39, i32 40, i32 37, i32 38, i32 35, i32 34, i32 32, i32 33, i32 31, i32 27, i32 30, i32 28, i32 29, i32 26, i32 25, i32 24, i32 23, i32 99, i32 21, i32 20, i32 19, i32 18, i32 17, i32 72, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 5, i32 6, i32 4, i32 66, i32 2, i32 1, i32 0>
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI-LABEL: 'test_vXi8'
@@ -402,15 +386,15 @@ define void @test_vXi8(<2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x
; AVX512VBMI-NEXT: Cost Model: Found costs of 2 for: %V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
; AVX512VBMI-NEXT: Cost Model: Found costs of 2 for: %V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 7, i32 6, i32 6, i32 8, i32 9, i32 2, i32 1, i32 0>
; AVX512VBMI-NEXT: Cost Model: Found costs of 2 for: %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 29, i32 14, i32 28, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512VBMI-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 31, i32 30, i32 45, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512VBMI-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 63, i32 100, i32 61, i32 96, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512VBMI-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 7, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 40, i32 55, i32 11, i32 9, i32 45, i32 4, i32 11, i32 4, i32 5, i32 35, i32 2, i32 33, i32 0>
+; AVX512VBMI-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 124, i32 62, i32 71, i32 127, i32 58, i32 60, i32 55, i32 56, i32 57, i32 54, i32 127, i32 52, i32 50, i32 51, i32 49, i32 48, i32 47, i32 45, i32 44, i32 44, i32 100, i32 42, i32 41, i32 39, i32 39, i32 40, i32 37, i32 38, i32 35, i32 34, i32 32, i32 33, i32 31, i32 27, i32 30, i32 28, i32 29, i32 26, i32 25, i32 24, i32 23, i32 99, i32 21, i32 20, i32 19, i32 18, i32 17, i32 72, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 5, i32 6, i32 4, i32 66, i32 2, i32 1, i32 0>
; AVX512VBMI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V16 = shufflevector <2 x i8> %src16, <2 x i8> %src16_1, <2 x i32> <i32 3, i32 0>
%V32 = shufflevector <4 x i8> %src32, <4 x i8> %src32_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5>
%V64 = shufflevector <8 x i8> %src64, <8 x i8> %src64_1, <8 x i32> <i32 7, i32 6, i32 6, i32 8, i32 9, i32 2, i32 1, i32 0>
%V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 29, i32 14, i32 28, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
- %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 31, i32 30, i32 45, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 1, i32 0>
- %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 63, i32 100, i32 61, i32 96, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 31, i32 33, i32 20, i32 27, i32 28, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 7, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 40, i32 55, i32 11, i32 9, i32 45, i32 4, i32 11, i32 4, i32 5, i32 35, i32 2, i32 33, i32 0>
+ %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 124, i32 62, i32 71, i32 127, i32 58, i32 60, i32 55, i32 56, i32 57, i32 54, i32 127, i32 52, i32 50, i32 51, i32 49, i32 48, i32 47, i32 45, i32 44, i32 44, i32 100, i32 42, i32 41, i32 39, i32 39, i32 40, i32 37, i32 38, i32 35, i32 34, i32 32, i32 33, i32 31, i32 27, i32 30, i32 28, i32 29, i32 26, i32 25, i32 24, i32 23, i32 99, i32 21, i32 20, i32 19, i32 18, i32 17, i32 72, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 5, i32 6, i32 4, i32 66, i32 2, i32 1, i32 0>
ret void
}
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost-inseltpoison.ll b/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost-inseltpoison.ll
index bd642791..c53da90 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost-inseltpoison.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost-inseltpoison.ll
@@ -531,31 +531,31 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, i64 %b) {
;
; AVX1-LABEL: 'splatvar_shift_v4i64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> poison, i64 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:7 SizeLat:13 for: %shift = ashr <4 x i64> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
; AVX2-LABEL: 'splatvar_shift_v4i64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> poison, i64 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:6 SizeLat:9 for: %shift = ashr <4 x i64> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v4i64'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> poison, i64 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:7 SizeLat:13 for: %shift = ashr <4 x i64> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v4i64'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> poison, i64 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:6 SizeLat:9 for: %shift = ashr <4 x i64> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
; AVX512-LABEL: 'splatvar_shift_v4i64'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> poison, i64 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %shift = ashr <4 x i64> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
@@ -574,31 +574,31 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, i64 %b) {
;
; AVX1-LABEL: 'splatvar_shift_v8i64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> poison, i64 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:20 Lat:14 SizeLat:26 for: %shift = ashr <8 x i64> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
; AVX2-LABEL: 'splatvar_shift_v8i64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> poison, i64 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:12 SizeLat:18 for: %shift = ashr <8 x i64> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v8i64'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> poison, i64 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:20 Lat:14 SizeLat:26 for: %shift = ashr <8 x i64> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v8i64'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> poison, i64 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:12 SizeLat:18 for: %shift = ashr <8 x i64> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
; AVX512-LABEL: 'splatvar_shift_v8i64'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> poison, i64 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %shift = ashr <8 x i64> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
@@ -660,31 +660,31 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, i32 %b) {
;
; AVX1-LABEL: 'splatvar_shift_v8i32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> poison, i32 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:7 SizeLat:7 for: %shift = ashr <8 x i32> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
; AVX2-LABEL: 'splatvar_shift_v8i32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> poison, i32 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = ashr <8 x i32> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v8i32'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> poison, i32 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:7 SizeLat:7 for: %shift = ashr <8 x i32> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v8i32'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> poison, i32 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = ashr <8 x i32> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
; AVX512-LABEL: 'splatvar_shift_v8i32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> poison, i32 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = ashr <8 x i32> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
@@ -703,31 +703,31 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) {
;
; AVX1-LABEL: 'splatvar_shift_v16i32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> poison, i32 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:14 SizeLat:14 for: %shift = ashr <16 x i32> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
; AVX2-LABEL: 'splatvar_shift_v16i32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> poison, i32 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:6 for: %shift = ashr <16 x i32> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v16i32'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> poison, i32 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:14 SizeLat:14 for: %shift = ashr <16 x i32> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v16i32'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> poison, i32 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:6 for: %shift = ashr <16 x i32> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
; AVX512-LABEL: 'splatvar_shift_v16i32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> poison, i32 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = ashr <16 x i32> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
@@ -740,43 +740,43 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) {
define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, i16 %b) {
; SSE2-LABEL: 'splatvar_shift_v8i16'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> poison, i16 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %shift = ashr <8 x i16> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; SSE42-LABEL: 'splatvar_shift_v8i16'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> poison, i16 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %shift = ashr <8 x i16> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; AVX1-LABEL: 'splatvar_shift_v8i16'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> poison, i16 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %shift = ashr <8 x i16> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; AVX2-LABEL: 'splatvar_shift_v8i16'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> poison, i16 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %shift = ashr <8 x i16> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v8i16'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> poison, i16 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %shift = ashr <8 x i16> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v8i16'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> poison, i16 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %shift = ashr <8 x i16> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; AVX512-LABEL: 'splatvar_shift_v8i16'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> poison, i16 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %shift = ashr <8 x i16> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
@@ -789,43 +789,43 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, i16 %b) {
define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, i16 %b) {
; SSE2-LABEL: 'splatvar_shift_v16i16'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> poison, i16 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %shift = ashr <16 x i16> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; SSE42-LABEL: 'splatvar_shift_v16i16'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> poison, i16 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %shift = ashr <16 x i16> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; AVX1-LABEL: 'splatvar_shift_v16i16'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> poison, i16 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:7 SizeLat:7 for: %shift = ashr <16 x i16> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; AVX2-LABEL: 'splatvar_shift_v16i16'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> poison, i16 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = ashr <16 x i16> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v16i16'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> poison, i16 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:7 SizeLat:7 for: %shift = ashr <16 x i16> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v16i16'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> poison, i16 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = ashr <16 x i16> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; AVX512-LABEL: 'splatvar_shift_v16i16'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> poison, i16 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = ashr <16 x i16> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
@@ -838,67 +838,67 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, i16 %b) {
define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
; SSE2-LABEL: 'splatvar_shift_v32i16'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %shift = ashr <32 x i16> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; SSE42-LABEL: 'splatvar_shift_v32i16'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %shift = ashr <32 x i16> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX1-LABEL: 'splatvar_shift_v32i16'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:14 SizeLat:14 for: %shift = ashr <32 x i16> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX2-LABEL: 'splatvar_shift_v32i16'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:6 for: %shift = ashr <32 x i16> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v32i16'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:14 SizeLat:14 for: %shift = ashr <32 x i16> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v32i16'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:6 for: %shift = ashr <32 x i16> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512F-LABEL: 'splatvar_shift_v32i16'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:10 SizeLat:7 for: %shift = ashr <32 x i16> %a, %splat
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512BW-LABEL: 'splatvar_shift_v32i16'
; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = ashr <32 x i16> %a, %splat
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512VL-LABEL: 'splatvar_shift_v32i16'
; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; AVX512VL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; AVX512VL-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:10 SizeLat:7 for: %shift = ashr <32 x i16> %a, %splat
; AVX512VL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512BWVL-LABEL: 'splatvar_shift_v32i16'
; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = ashr <32 x i16> %a, %splat
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512GFNI-LABEL: 'splatvar_shift_v32i16'
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = ashr <32 x i16> %a, %splat
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
@@ -911,67 +911,67 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
; SSE2-LABEL: 'splatvar_shift_v16i8'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:9 Lat:15 SizeLat:13 for: %shift = ashr <16 x i8> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; SSE42-LABEL: 'splatvar_shift_v16i8'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:11 CodeSize:9 Lat:15 SizeLat:13 for: %shift = ashr <16 x i8> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX1-LABEL: 'splatvar_shift_v16i8'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:9 Lat:6 SizeLat:13 for: %shift = ashr <16 x i8> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX2-LABEL: 'splatvar_shift_v16i8'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:9 Lat:5 SizeLat:13 for: %shift = ashr <16 x i8> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v16i8'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %shift = ashr <16 x i8> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v16i8'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:9 Lat:5 SizeLat:13 for: %shift = ashr <16 x i8> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX512F-LABEL: 'splatvar_shift_v16i8'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:4 CodeSize:9 Lat:5 SizeLat:13 for: %shift = ashr <16 x i8> %a, %splat
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX512BW-LABEL: 'splatvar_shift_v16i8'
; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:4 CodeSize:8 Lat:12 SizeLat:12 for: %shift = ashr <16 x i8> %a, %splat
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX512VL-LABEL: 'splatvar_shift_v16i8'
; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; AVX512VL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; AVX512VL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:9 Lat:5 SizeLat:13 for: %shift = ashr <16 x i8> %a, %splat
; AVX512VL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX512BWVL-LABEL: 'splatvar_shift_v16i8'
; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:8 Lat:12 SizeLat:12 for: %shift = ashr <16 x i8> %a, %splat
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX512GFNI-LABEL: 'splatvar_shift_v16i8'
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:4 CodeSize:8 Lat:12 SizeLat:12 for: %shift = ashr <16 x i8> %a, %splat
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
@@ -984,67 +984,67 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
; SSE2-LABEL: 'splatvar_shift_v32i8'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:18 Lat:30 SizeLat:26 for: %shift = ashr <32 x i8> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; SSE42-LABEL: 'splatvar_shift_v32i8'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:22 CodeSize:18 Lat:30 SizeLat:26 for: %shift = ashr <32 x i8> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX1-LABEL: 'splatvar_shift_v32i8'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:10 CodeSize:16 Lat:11 SizeLat:21 for: %shift = ashr <32 x i8> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX2-LABEL: 'splatvar_shift_v32i8'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:11 Lat:9 SizeLat:16 for: %shift = ashr <32 x i8> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v32i8'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:7 SizeLat:6 for: %shift = ashr <32 x i8> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v32i8'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:11 Lat:9 SizeLat:16 for: %shift = ashr <32 x i8> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX512F-LABEL: 'splatvar_shift_v32i8'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:6 CodeSize:11 Lat:9 SizeLat:16 for: %shift = ashr <32 x i8> %a, %splat
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX512BW-LABEL: 'splatvar_shift_v32i8'
; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:5 CodeSize:10 Lat:10 SizeLat:13 for: %shift = ashr <32 x i8> %a, %splat
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX512VL-LABEL: 'splatvar_shift_v32i8'
; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; AVX512VL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; AVX512VL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:11 Lat:9 SizeLat:16 for: %shift = ashr <32 x i8> %a, %splat
; AVX512VL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX512BWVL-LABEL: 'splatvar_shift_v32i8'
; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:5 CodeSize:10 Lat:10 SizeLat:13 for: %shift = ashr <32 x i8> %a, %splat
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX512GFNI-LABEL: 'splatvar_shift_v32i8'
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:5 CodeSize:10 Lat:10 SizeLat:13 for: %shift = ashr <32 x i8> %a, %splat
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
@@ -1057,67 +1057,67 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) {
; SSE2-LABEL: 'splatvar_shift_v64i8'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:44 CodeSize:36 Lat:60 SizeLat:52 for: %shift = ashr <64 x i8> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; SSE42-LABEL: 'splatvar_shift_v64i8'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:44 CodeSize:36 Lat:60 SizeLat:52 for: %shift = ashr <64 x i8> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX1-LABEL: 'splatvar_shift_v64i8'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:20 CodeSize:32 Lat:22 SizeLat:42 for: %shift = ashr <64 x i8> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX2-LABEL: 'splatvar_shift_v64i8'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:22 Lat:18 SizeLat:32 for: %shift = ashr <64 x i8> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v64i8'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:10 Lat:14 SizeLat:12 for: %shift = ashr <64 x i8> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v64i8'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:22 Lat:18 SizeLat:32 for: %shift = ashr <64 x i8> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512F-LABEL: 'splatvar_shift_v64i8'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:37 CodeSize:51 Lat:37 SizeLat:63 for: %shift = ashr <64 x i8> %a, %splat
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512BW-LABEL: 'splatvar_shift_v64i8'
; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:5 CodeSize:10 Lat:10 SizeLat:15 for: %shift = ashr <64 x i8> %a, %splat
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512VL-LABEL: 'splatvar_shift_v64i8'
; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; AVX512VL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; AVX512VL-NEXT: Cost Model: Found costs of RThru:37 CodeSize:51 Lat:37 SizeLat:63 for: %shift = ashr <64 x i8> %a, %splat
; AVX512VL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512BWVL-LABEL: 'splatvar_shift_v64i8'
; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:5 CodeSize:10 Lat:10 SizeLat:15 for: %shift = ashr <64 x i8> %a, %splat
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512GFNI-LABEL: 'splatvar_shift_v64i8'
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:5 CodeSize:10 Lat:10 SizeLat:15 for: %shift = ashr <64 x i8> %a, %splat
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost.ll b/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
index fdf011e..7a06269 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
@@ -531,31 +531,31 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, i64 %b) {
;
; AVX1-LABEL: 'splatvar_shift_v4i64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:7 SizeLat:13 for: %shift = ashr <4 x i64> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
; AVX2-LABEL: 'splatvar_shift_v4i64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:6 SizeLat:9 for: %shift = ashr <4 x i64> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v4i64'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:7 SizeLat:13 for: %shift = ashr <4 x i64> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v4i64'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:6 SizeLat:9 for: %shift = ashr <4 x i64> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
; AVX512-LABEL: 'splatvar_shift_v4i64'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %shift = ashr <4 x i64> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
@@ -574,31 +574,31 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, i64 %b) {
;
; AVX1-LABEL: 'splatvar_shift_v8i64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:20 Lat:14 SizeLat:26 for: %shift = ashr <8 x i64> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
; AVX2-LABEL: 'splatvar_shift_v8i64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:12 SizeLat:18 for: %shift = ashr <8 x i64> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v8i64'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:20 Lat:14 SizeLat:26 for: %shift = ashr <8 x i64> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v8i64'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:12 SizeLat:18 for: %shift = ashr <8 x i64> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
; AVX512-LABEL: 'splatvar_shift_v8i64'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %shift = ashr <8 x i64> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
@@ -660,31 +660,31 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, i32 %b) {
;
; AVX1-LABEL: 'splatvar_shift_v8i32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:7 SizeLat:7 for: %shift = ashr <8 x i32> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
; AVX2-LABEL: 'splatvar_shift_v8i32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = ashr <8 x i32> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v8i32'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:7 SizeLat:7 for: %shift = ashr <8 x i32> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v8i32'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = ashr <8 x i32> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
; AVX512-LABEL: 'splatvar_shift_v8i32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = ashr <8 x i32> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
@@ -703,31 +703,31 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) {
;
; AVX1-LABEL: 'splatvar_shift_v16i32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:14 SizeLat:14 for: %shift = ashr <16 x i32> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
; AVX2-LABEL: 'splatvar_shift_v16i32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:6 for: %shift = ashr <16 x i32> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v16i32'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:14 SizeLat:14 for: %shift = ashr <16 x i32> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v16i32'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:6 for: %shift = ashr <16 x i32> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
; AVX512-LABEL: 'splatvar_shift_v16i32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = ashr <16 x i32> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
@@ -740,43 +740,43 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) {
define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, i16 %b) {
; SSE2-LABEL: 'splatvar_shift_v8i16'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> undef, i16 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %shift = ashr <8 x i16> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; SSE42-LABEL: 'splatvar_shift_v8i16'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> undef, i16 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %shift = ashr <8 x i16> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; AVX1-LABEL: 'splatvar_shift_v8i16'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> undef, i16 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %shift = ashr <8 x i16> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; AVX2-LABEL: 'splatvar_shift_v8i16'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> undef, i16 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %shift = ashr <8 x i16> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v8i16'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> undef, i16 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %shift = ashr <8 x i16> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v8i16'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> undef, i16 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %shift = ashr <8 x i16> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; AVX512-LABEL: 'splatvar_shift_v8i16'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> undef, i16 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %shift = ashr <8 x i16> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
@@ -789,43 +789,43 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, i16 %b) {
define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, i16 %b) {
; SSE2-LABEL: 'splatvar_shift_v16i16'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> undef, i16 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %shift = ashr <16 x i16> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; SSE42-LABEL: 'splatvar_shift_v16i16'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> undef, i16 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %shift = ashr <16 x i16> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; AVX1-LABEL: 'splatvar_shift_v16i16'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> undef, i16 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:7 SizeLat:7 for: %shift = ashr <16 x i16> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; AVX2-LABEL: 'splatvar_shift_v16i16'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> undef, i16 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = ashr <16 x i16> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v16i16'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> undef, i16 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:7 SizeLat:7 for: %shift = ashr <16 x i16> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v16i16'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> undef, i16 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = ashr <16 x i16> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; AVX512-LABEL: 'splatvar_shift_v16i16'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> undef, i16 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = ashr <16 x i16> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
@@ -838,67 +838,67 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, i16 %b) {
define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
; SSE2-LABEL: 'splatvar_shift_v32i16'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %shift = ashr <32 x i16> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; SSE42-LABEL: 'splatvar_shift_v32i16'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %shift = ashr <32 x i16> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX1-LABEL: 'splatvar_shift_v32i16'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:14 SizeLat:14 for: %shift = ashr <32 x i16> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX2-LABEL: 'splatvar_shift_v32i16'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:6 for: %shift = ashr <32 x i16> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v32i16'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:14 SizeLat:14 for: %shift = ashr <32 x i16> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v32i16'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:6 for: %shift = ashr <32 x i16> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512F-LABEL: 'splatvar_shift_v32i16'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:10 SizeLat:7 for: %shift = ashr <32 x i16> %a, %splat
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512BW-LABEL: 'splatvar_shift_v32i16'
; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = ashr <32 x i16> %a, %splat
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512VL-LABEL: 'splatvar_shift_v32i16'
; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512VL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512VL-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:10 SizeLat:7 for: %shift = ashr <32 x i16> %a, %splat
; AVX512VL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512BWVL-LABEL: 'splatvar_shift_v32i16'
; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = ashr <32 x i16> %a, %splat
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512GFNI-LABEL: 'splatvar_shift_v32i16'
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = ashr <32 x i16> %a, %splat
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
@@ -911,67 +911,67 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
; SSE2-LABEL: 'splatvar_shift_v16i8'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:9 Lat:15 SizeLat:13 for: %shift = ashr <16 x i8> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; SSE42-LABEL: 'splatvar_shift_v16i8'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:11 CodeSize:9 Lat:15 SizeLat:13 for: %shift = ashr <16 x i8> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX1-LABEL: 'splatvar_shift_v16i8'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:9 Lat:6 SizeLat:13 for: %shift = ashr <16 x i8> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX2-LABEL: 'splatvar_shift_v16i8'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:9 Lat:5 SizeLat:13 for: %shift = ashr <16 x i8> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v16i8'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %shift = ashr <16 x i8> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v16i8'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:9 Lat:5 SizeLat:13 for: %shift = ashr <16 x i8> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX512F-LABEL: 'splatvar_shift_v16i8'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:4 CodeSize:9 Lat:5 SizeLat:13 for: %shift = ashr <16 x i8> %a, %splat
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX512BW-LABEL: 'splatvar_shift_v16i8'
; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:4 CodeSize:8 Lat:12 SizeLat:12 for: %shift = ashr <16 x i8> %a, %splat
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX512VL-LABEL: 'splatvar_shift_v16i8'
; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512VL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; AVX512VL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:9 Lat:5 SizeLat:13 for: %shift = ashr <16 x i8> %a, %splat
; AVX512VL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX512BWVL-LABEL: 'splatvar_shift_v16i8'
; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:8 Lat:12 SizeLat:12 for: %shift = ashr <16 x i8> %a, %splat
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX512GFNI-LABEL: 'splatvar_shift_v16i8'
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:4 CodeSize:8 Lat:12 SizeLat:12 for: %shift = ashr <16 x i8> %a, %splat
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
@@ -984,67 +984,67 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
; SSE2-LABEL: 'splatvar_shift_v32i8'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:18 Lat:30 SizeLat:26 for: %shift = ashr <32 x i8> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; SSE42-LABEL: 'splatvar_shift_v32i8'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:22 CodeSize:18 Lat:30 SizeLat:26 for: %shift = ashr <32 x i8> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX1-LABEL: 'splatvar_shift_v32i8'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:10 CodeSize:16 Lat:11 SizeLat:21 for: %shift = ashr <32 x i8> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX2-LABEL: 'splatvar_shift_v32i8'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:11 Lat:9 SizeLat:16 for: %shift = ashr <32 x i8> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v32i8'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:7 SizeLat:6 for: %shift = ashr <32 x i8> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v32i8'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:11 Lat:9 SizeLat:16 for: %shift = ashr <32 x i8> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX512F-LABEL: 'splatvar_shift_v32i8'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:6 CodeSize:11 Lat:9 SizeLat:16 for: %shift = ashr <32 x i8> %a, %splat
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX512BW-LABEL: 'splatvar_shift_v32i8'
; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:5 CodeSize:10 Lat:10 SizeLat:13 for: %shift = ashr <32 x i8> %a, %splat
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX512VL-LABEL: 'splatvar_shift_v32i8'
; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512VL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; AVX512VL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:11 Lat:9 SizeLat:16 for: %shift = ashr <32 x i8> %a, %splat
; AVX512VL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX512BWVL-LABEL: 'splatvar_shift_v32i8'
; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:5 CodeSize:10 Lat:10 SizeLat:13 for: %shift = ashr <32 x i8> %a, %splat
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX512GFNI-LABEL: 'splatvar_shift_v32i8'
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:5 CodeSize:10 Lat:10 SizeLat:13 for: %shift = ashr <32 x i8> %a, %splat
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
@@ -1057,67 +1057,67 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) {
; SSE2-LABEL: 'splatvar_shift_v64i8'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:44 CodeSize:36 Lat:60 SizeLat:52 for: %shift = ashr <64 x i8> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; SSE42-LABEL: 'splatvar_shift_v64i8'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:44 CodeSize:36 Lat:60 SizeLat:52 for: %shift = ashr <64 x i8> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX1-LABEL: 'splatvar_shift_v64i8'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:20 CodeSize:32 Lat:22 SizeLat:42 for: %shift = ashr <64 x i8> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX2-LABEL: 'splatvar_shift_v64i8'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:22 Lat:18 SizeLat:32 for: %shift = ashr <64 x i8> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v64i8'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:10 Lat:14 SizeLat:12 for: %shift = ashr <64 x i8> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v64i8'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:22 Lat:18 SizeLat:32 for: %shift = ashr <64 x i8> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512F-LABEL: 'splatvar_shift_v64i8'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:37 CodeSize:51 Lat:37 SizeLat:63 for: %shift = ashr <64 x i8> %a, %splat
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512BW-LABEL: 'splatvar_shift_v64i8'
; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:5 CodeSize:10 Lat:10 SizeLat:15 for: %shift = ashr <64 x i8> %a, %splat
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512VL-LABEL: 'splatvar_shift_v64i8'
; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512VL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512VL-NEXT: Cost Model: Found costs of RThru:37 CodeSize:51 Lat:37 SizeLat:63 for: %shift = ashr <64 x i8> %a, %splat
; AVX512VL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512BWVL-LABEL: 'splatvar_shift_v64i8'
; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:5 CodeSize:10 Lat:10 SizeLat:15 for: %shift = ashr <64 x i8> %a, %splat
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512GFNI-LABEL: 'splatvar_shift_v64i8'
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:5 CodeSize:10 Lat:10 SizeLat:15 for: %shift = ashr <64 x i8> %a, %splat
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost-inseltpoison.ll b/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost-inseltpoison.ll
index 2f70a4b..ee7546f 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost-inseltpoison.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost-inseltpoison.ll
@@ -527,31 +527,31 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, i64 %b) {
;
; AVX1-LABEL: 'splatvar_shift_v4i64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> poison, i64 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:7 SizeLat:6 for: %shift = lshr <4 x i64> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
; AVX2-LABEL: 'splatvar_shift_v4i64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> poison, i64 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:2 for: %shift = lshr <4 x i64> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v4i64'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> poison, i64 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:7 SizeLat:6 for: %shift = lshr <4 x i64> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v4i64'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> poison, i64 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:2 for: %shift = lshr <4 x i64> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
; AVX512-LABEL: 'splatvar_shift_v4i64'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> poison, i64 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %shift = lshr <4 x i64> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
@@ -570,31 +570,31 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, i64 %b) {
;
; AVX1-LABEL: 'splatvar_shift_v8i64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> poison, i64 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:12 for: %shift = lshr <8 x i64> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
; AVX2-LABEL: 'splatvar_shift_v8i64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> poison, i64 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:8 SizeLat:4 for: %shift = lshr <8 x i64> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v8i64'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> poison, i64 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:12 for: %shift = lshr <8 x i64> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v8i64'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> poison, i64 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:8 SizeLat:4 for: %shift = lshr <8 x i64> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
; AVX512-LABEL: 'splatvar_shift_v8i64'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> poison, i64 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %shift = lshr <8 x i64> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
@@ -656,31 +656,31 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, i32 %b) {
;
; AVX1-LABEL: 'splatvar_shift_v8i32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> poison, i32 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:7 SizeLat:7 for: %shift = lshr <8 x i32> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
; AVX2-LABEL: 'splatvar_shift_v8i32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> poison, i32 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = lshr <8 x i32> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v8i32'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> poison, i32 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:7 SizeLat:7 for: %shift = lshr <8 x i32> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v8i32'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> poison, i32 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = lshr <8 x i32> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
; AVX512-LABEL: 'splatvar_shift_v8i32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> poison, i32 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = lshr <8 x i32> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
@@ -699,31 +699,31 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) {
;
; AVX1-LABEL: 'splatvar_shift_v16i32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> poison, i32 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:14 SizeLat:14 for: %shift = lshr <16 x i32> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
; AVX2-LABEL: 'splatvar_shift_v16i32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> poison, i32 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:6 for: %shift = lshr <16 x i32> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v16i32'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> poison, i32 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:14 SizeLat:14 for: %shift = lshr <16 x i32> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v16i32'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> poison, i32 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:6 for: %shift = lshr <16 x i32> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
; AVX512-LABEL: 'splatvar_shift_v16i32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> poison, i32 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = lshr <16 x i32> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
@@ -736,43 +736,43 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) {
define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, i16 %b) {
; SSE2-LABEL: 'splatvar_shift_v8i16'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> poison, i16 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %shift = lshr <8 x i16> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; SSE42-LABEL: 'splatvar_shift_v8i16'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> poison, i16 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %shift = lshr <8 x i16> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; AVX1-LABEL: 'splatvar_shift_v8i16'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> poison, i16 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %shift = lshr <8 x i16> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; AVX2-LABEL: 'splatvar_shift_v8i16'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> poison, i16 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %shift = lshr <8 x i16> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v8i16'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> poison, i16 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %shift = lshr <8 x i16> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v8i16'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> poison, i16 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %shift = lshr <8 x i16> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; AVX512-LABEL: 'splatvar_shift_v8i16'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> poison, i16 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %shift = lshr <8 x i16> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
@@ -785,43 +785,43 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, i16 %b) {
define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, i16 %b) {
; SSE2-LABEL: 'splatvar_shift_v16i16'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> poison, i16 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %shift = lshr <16 x i16> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; SSE42-LABEL: 'splatvar_shift_v16i16'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> poison, i16 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %shift = lshr <16 x i16> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; AVX1-LABEL: 'splatvar_shift_v16i16'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> poison, i16 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:7 SizeLat:7 for: %shift = lshr <16 x i16> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; AVX2-LABEL: 'splatvar_shift_v16i16'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> poison, i16 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = lshr <16 x i16> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v16i16'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> poison, i16 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:7 SizeLat:7 for: %shift = lshr <16 x i16> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v16i16'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> poison, i16 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = lshr <16 x i16> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; AVX512-LABEL: 'splatvar_shift_v16i16'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> poison, i16 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = lshr <16 x i16> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
@@ -834,67 +834,67 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, i16 %b) {
define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
; SSE2-LABEL: 'splatvar_shift_v32i16'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %shift = lshr <32 x i16> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; SSE42-LABEL: 'splatvar_shift_v32i16'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %shift = lshr <32 x i16> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX1-LABEL: 'splatvar_shift_v32i16'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:14 SizeLat:14 for: %shift = lshr <32 x i16> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX2-LABEL: 'splatvar_shift_v32i16'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:6 for: %shift = lshr <32 x i16> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v32i16'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:14 SizeLat:14 for: %shift = lshr <32 x i16> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v32i16'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:6 for: %shift = lshr <32 x i16> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512F-LABEL: 'splatvar_shift_v32i16'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:10 SizeLat:7 for: %shift = lshr <32 x i16> %a, %splat
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512BW-LABEL: 'splatvar_shift_v32i16'
; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = lshr <32 x i16> %a, %splat
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512VL-LABEL: 'splatvar_shift_v32i16'
; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; AVX512VL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; AVX512VL-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:10 SizeLat:7 for: %shift = lshr <32 x i16> %a, %splat
; AVX512VL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512BWVL-LABEL: 'splatvar_shift_v32i16'
; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = lshr <32 x i16> %a, %splat
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512GFNI-LABEL: 'splatvar_shift_v32i16'
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = lshr <32 x i16> %a, %splat
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
@@ -907,67 +907,67 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
; SSE2-LABEL: 'splatvar_shift_v16i8'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:5 Lat:13 SizeLat:9 for: %shift = lshr <16 x i8> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; SSE42-LABEL: 'splatvar_shift_v16i8'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:9 CodeSize:5 Lat:13 SizeLat:9 for: %shift = lshr <16 x i8> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX1-LABEL: 'splatvar_shift_v16i8'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:8 SizeLat:8 for: %shift = lshr <16 x i8> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX2-LABEL: 'splatvar_shift_v16i8'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:9 SizeLat:8 for: %shift = lshr <16 x i8> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v16i8'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %shift = lshr <16 x i8> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v16i8'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:9 SizeLat:8 for: %shift = lshr <16 x i8> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX512F-LABEL: 'splatvar_shift_v16i8'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:9 SizeLat:8 for: %shift = lshr <16 x i8> %a, %splat
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX512BW-LABEL: 'splatvar_shift_v16i8'
; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:10 SizeLat:8 for: %shift = lshr <16 x i8> %a, %splat
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX512VL-LABEL: 'splatvar_shift_v16i8'
; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; AVX512VL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; AVX512VL-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:9 SizeLat:8 for: %shift = lshr <16 x i8> %a, %splat
; AVX512VL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX512BWVL-LABEL: 'splatvar_shift_v16i8'
; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:10 SizeLat:8 for: %shift = lshr <16 x i8> %a, %splat
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX512GFNI-LABEL: 'splatvar_shift_v16i8'
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:10 SizeLat:8 for: %shift = lshr <16 x i8> %a, %splat
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
@@ -980,43 +980,43 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
; SSE2-LABEL: 'splatvar_shift_v32i8'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:10 Lat:26 SizeLat:18 for: %shift = lshr <32 x i8> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; SSE42-LABEL: 'splatvar_shift_v32i8'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:18 CodeSize:10 Lat:26 SizeLat:18 for: %shift = lshr <32 x i8> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX1-LABEL: 'splatvar_shift_v32i8'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:10 Lat:9 SizeLat:14 for: %shift = lshr <32 x i8> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX2-LABEL: 'splatvar_shift_v32i8'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:7 Lat:8 SizeLat:9 for: %shift = lshr <32 x i8> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v32i8'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:7 SizeLat:6 for: %shift = lshr <32 x i8> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v32i8'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:7 Lat:8 SizeLat:9 for: %shift = lshr <32 x i8> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX512-LABEL: 'splatvar_shift_v32i8'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:4 CodeSize:7 Lat:8 SizeLat:9 for: %shift = lshr <32 x i8> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
@@ -1029,67 +1029,67 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) {
; SSE2-LABEL: 'splatvar_shift_v64i8'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:36 CodeSize:20 Lat:52 SizeLat:36 for: %shift = lshr <64 x i8> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; SSE42-LABEL: 'splatvar_shift_v64i8'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:36 CodeSize:20 Lat:52 SizeLat:36 for: %shift = lshr <64 x i8> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX1-LABEL: 'splatvar_shift_v64i8'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:20 Lat:18 SizeLat:28 for: %shift = lshr <64 x i8> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX2-LABEL: 'splatvar_shift_v64i8'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:14 Lat:16 SizeLat:18 for: %shift = lshr <64 x i8> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v64i8'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:10 Lat:14 SizeLat:12 for: %shift = lshr <64 x i8> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v64i8'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:14 Lat:16 SizeLat:18 for: %shift = lshr <64 x i8> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512F-LABEL: 'splatvar_shift_v64i8'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:15 CodeSize:30 Lat:19 SizeLat:36 for: %shift = lshr <64 x i8> %a, %splat
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512BW-LABEL: 'splatvar_shift_v64i8'
; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:4 CodeSize:7 Lat:8 SizeLat:10 for: %shift = lshr <64 x i8> %a, %splat
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512VL-LABEL: 'splatvar_shift_v64i8'
; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; AVX512VL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; AVX512VL-NEXT: Cost Model: Found costs of RThru:15 CodeSize:30 Lat:19 SizeLat:36 for: %shift = lshr <64 x i8> %a, %splat
; AVX512VL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512BWVL-LABEL: 'splatvar_shift_v64i8'
; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:7 Lat:8 SizeLat:10 for: %shift = lshr <64 x i8> %a, %splat
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512GFNI-LABEL: 'splatvar_shift_v64i8'
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:4 CodeSize:7 Lat:8 SizeLat:10 for: %shift = lshr <64 x i8> %a, %splat
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
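(Note on the updated checks: the RThru/CodeSize/Lat/SizeLat quadruples are TTI's four cost kinds — reciprocal throughput, code size, latency, and size-and-latency — printed together when the cost-model printer runs in its -cost-kind=all mode. Below is a minimal sketch of the kind of test these files contain, assuming the in-tree print<cost-model> pass and an arbitrarily chosen +avx2 attribute; the real files carry one RUN line and one check prefix per target configuration, and their CHECK lines are regenerated with utils/update_analyze_test_checks.py rather than edited by hand.)

; RUN: opt < %s -mtriple=x86_64-- -mattr=+avx2 -passes="print<cost-model>" \
; RUN:   -cost-kind=all -disable-output 2>&1 | FileCheck %s

define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
; The printer emits one "Cost Model: Found costs of ..." line per IR
; instruction; the regex keeps this sketch independent of exact numbers.
; CHECK: Cost Model: Found costs of {{.*}} for: %shift = lshr <16 x i8> %a, %splat
  %insert = insertelement <16 x i8> poison, i8 %b, i32 0
  %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
  %shift = lshr <16 x i8> %a, %splat
  ret <16 x i8> %shift
}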
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost.ll b/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
index 78fd9f33..a508c2a 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
@@ -527,31 +527,31 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, i64 %b) {
;
; AVX1-LABEL: 'splatvar_shift_v4i64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:7 SizeLat:6 for: %shift = lshr <4 x i64> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
; AVX2-LABEL: 'splatvar_shift_v4i64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:2 for: %shift = lshr <4 x i64> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v4i64'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:7 SizeLat:6 for: %shift = lshr <4 x i64> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v4i64'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:2 for: %shift = lshr <4 x i64> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
; AVX512-LABEL: 'splatvar_shift_v4i64'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %shift = lshr <4 x i64> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
@@ -570,31 +570,31 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, i64 %b) {
;
; AVX1-LABEL: 'splatvar_shift_v8i64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:12 for: %shift = lshr <8 x i64> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
; AVX2-LABEL: 'splatvar_shift_v8i64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:8 SizeLat:4 for: %shift = lshr <8 x i64> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v8i64'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:12 for: %shift = lshr <8 x i64> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v8i64'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:8 SizeLat:4 for: %shift = lshr <8 x i64> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
; AVX512-LABEL: 'splatvar_shift_v8i64'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %shift = lshr <8 x i64> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
@@ -656,31 +656,31 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, i32 %b) {
;
; AVX1-LABEL: 'splatvar_shift_v8i32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:7 SizeLat:7 for: %shift = lshr <8 x i32> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
; AVX2-LABEL: 'splatvar_shift_v8i32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = lshr <8 x i32> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v8i32'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:7 SizeLat:7 for: %shift = lshr <8 x i32> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v8i32'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = lshr <8 x i32> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
; AVX512-LABEL: 'splatvar_shift_v8i32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = lshr <8 x i32> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
@@ -699,31 +699,31 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) {
;
; AVX1-LABEL: 'splatvar_shift_v16i32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:14 SizeLat:14 for: %shift = lshr <16 x i32> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
; AVX2-LABEL: 'splatvar_shift_v16i32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:6 for: %shift = lshr <16 x i32> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v16i32'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:14 SizeLat:14 for: %shift = lshr <16 x i32> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v16i32'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:6 for: %shift = lshr <16 x i32> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
; AVX512-LABEL: 'splatvar_shift_v16i32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = lshr <16 x i32> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
@@ -736,43 +736,43 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) {
define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, i16 %b) {
; SSE2-LABEL: 'splatvar_shift_v8i16'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> undef, i16 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %shift = lshr <8 x i16> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; SSE42-LABEL: 'splatvar_shift_v8i16'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> undef, i16 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %shift = lshr <8 x i16> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; AVX1-LABEL: 'splatvar_shift_v8i16'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> undef, i16 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %shift = lshr <8 x i16> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; AVX2-LABEL: 'splatvar_shift_v8i16'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> undef, i16 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %shift = lshr <8 x i16> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v8i16'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> undef, i16 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %shift = lshr <8 x i16> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v8i16'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> undef, i16 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %shift = lshr <8 x i16> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; AVX512-LABEL: 'splatvar_shift_v8i16'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> undef, i16 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %shift = lshr <8 x i16> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
@@ -785,43 +785,43 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, i16 %b) {
define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, i16 %b) {
; SSE2-LABEL: 'splatvar_shift_v16i16'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> undef, i16 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %shift = lshr <16 x i16> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; SSE42-LABEL: 'splatvar_shift_v16i16'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> undef, i16 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %shift = lshr <16 x i16> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; AVX1-LABEL: 'splatvar_shift_v16i16'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> undef, i16 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:7 SizeLat:7 for: %shift = lshr <16 x i16> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; AVX2-LABEL: 'splatvar_shift_v16i16'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> undef, i16 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = lshr <16 x i16> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v16i16'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> undef, i16 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:7 SizeLat:7 for: %shift = lshr <16 x i16> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v16i16'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> undef, i16 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = lshr <16 x i16> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; AVX512-LABEL: 'splatvar_shift_v16i16'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> undef, i16 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = lshr <16 x i16> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
@@ -834,67 +834,67 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, i16 %b) {
define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
; SSE2-LABEL: 'splatvar_shift_v32i16'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %shift = lshr <32 x i16> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; SSE42-LABEL: 'splatvar_shift_v32i16'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %shift = lshr <32 x i16> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX1-LABEL: 'splatvar_shift_v32i16'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:14 SizeLat:14 for: %shift = lshr <32 x i16> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX2-LABEL: 'splatvar_shift_v32i16'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:6 for: %shift = lshr <32 x i16> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v32i16'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:14 SizeLat:14 for: %shift = lshr <32 x i16> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v32i16'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:6 for: %shift = lshr <32 x i16> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512F-LABEL: 'splatvar_shift_v32i16'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:10 SizeLat:7 for: %shift = lshr <32 x i16> %a, %splat
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512BW-LABEL: 'splatvar_shift_v32i16'
; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = lshr <32 x i16> %a, %splat
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512VL-LABEL: 'splatvar_shift_v32i16'
; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512VL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512VL-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:10 SizeLat:7 for: %shift = lshr <32 x i16> %a, %splat
; AVX512VL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512BWVL-LABEL: 'splatvar_shift_v32i16'
; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = lshr <32 x i16> %a, %splat
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512GFNI-LABEL: 'splatvar_shift_v32i16'
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = lshr <32 x i16> %a, %splat
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
@@ -907,67 +907,67 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
; SSE2-LABEL: 'splatvar_shift_v16i8'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:5 Lat:13 SizeLat:9 for: %shift = lshr <16 x i8> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; SSE42-LABEL: 'splatvar_shift_v16i8'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:9 CodeSize:5 Lat:13 SizeLat:9 for: %shift = lshr <16 x i8> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX1-LABEL: 'splatvar_shift_v16i8'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:8 SizeLat:8 for: %shift = lshr <16 x i8> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX2-LABEL: 'splatvar_shift_v16i8'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:9 SizeLat:8 for: %shift = lshr <16 x i8> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v16i8'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %shift = lshr <16 x i8> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v16i8'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:9 SizeLat:8 for: %shift = lshr <16 x i8> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX512F-LABEL: 'splatvar_shift_v16i8'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:9 SizeLat:8 for: %shift = lshr <16 x i8> %a, %splat
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX512BW-LABEL: 'splatvar_shift_v16i8'
; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:10 SizeLat:8 for: %shift = lshr <16 x i8> %a, %splat
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX512VL-LABEL: 'splatvar_shift_v16i8'
; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512VL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; AVX512VL-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:9 SizeLat:8 for: %shift = lshr <16 x i8> %a, %splat
; AVX512VL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX512BWVL-LABEL: 'splatvar_shift_v16i8'
; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:10 SizeLat:8 for: %shift = lshr <16 x i8> %a, %splat
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX512GFNI-LABEL: 'splatvar_shift_v16i8'
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:10 SizeLat:8 for: %shift = lshr <16 x i8> %a, %splat
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
@@ -980,43 +980,43 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
; SSE2-LABEL: 'splatvar_shift_v32i8'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:10 Lat:26 SizeLat:18 for: %shift = lshr <32 x i8> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; SSE42-LABEL: 'splatvar_shift_v32i8'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:18 CodeSize:10 Lat:26 SizeLat:18 for: %shift = lshr <32 x i8> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX1-LABEL: 'splatvar_shift_v32i8'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:10 Lat:9 SizeLat:14 for: %shift = lshr <32 x i8> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX2-LABEL: 'splatvar_shift_v32i8'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:7 Lat:8 SizeLat:9 for: %shift = lshr <32 x i8> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v32i8'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:7 SizeLat:6 for: %shift = lshr <32 x i8> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v32i8'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:7 Lat:8 SizeLat:9 for: %shift = lshr <32 x i8> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX512-LABEL: 'splatvar_shift_v32i8'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:4 CodeSize:7 Lat:8 SizeLat:9 for: %shift = lshr <32 x i8> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
@@ -1029,67 +1029,67 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) {
; SSE2-LABEL: 'splatvar_shift_v64i8'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:36 CodeSize:20 Lat:52 SizeLat:36 for: %shift = lshr <64 x i8> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; SSE42-LABEL: 'splatvar_shift_v64i8'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:36 CodeSize:20 Lat:52 SizeLat:36 for: %shift = lshr <64 x i8> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX1-LABEL: 'splatvar_shift_v64i8'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:20 Lat:18 SizeLat:28 for: %shift = lshr <64 x i8> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX2-LABEL: 'splatvar_shift_v64i8'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:14 Lat:16 SizeLat:18 for: %shift = lshr <64 x i8> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v64i8'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:10 Lat:14 SizeLat:12 for: %shift = lshr <64 x i8> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v64i8'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:14 Lat:16 SizeLat:18 for: %shift = lshr <64 x i8> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512F-LABEL: 'splatvar_shift_v64i8'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:15 CodeSize:30 Lat:19 SizeLat:36 for: %shift = lshr <64 x i8> %a, %splat
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512BW-LABEL: 'splatvar_shift_v64i8'
; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:4 CodeSize:7 Lat:8 SizeLat:10 for: %shift = lshr <64 x i8> %a, %splat
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512VL-LABEL: 'splatvar_shift_v64i8'
; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512VL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512VL-NEXT: Cost Model: Found costs of RThru:15 CodeSize:30 Lat:19 SizeLat:36 for: %shift = lshr <64 x i8> %a, %splat
; AVX512VL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512BWVL-LABEL: 'splatvar_shift_v64i8'
; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:7 Lat:8 SizeLat:10 for: %shift = lshr <64 x i8> %a, %splat
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512GFNI-LABEL: 'splatvar_shift_v64i8'
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:4 CodeSize:7 Lat:8 SizeLat:10 for: %shift = lshr <64 x i8> %a, %splat
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-shl-cost-inseltpoison.ll b/llvm/test/Analysis/CostModel/X86/vshift-shl-cost-inseltpoison.ll
index adedca8..375dac5 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-shl-cost-inseltpoison.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-shl-cost-inseltpoison.ll
@@ -567,31 +567,31 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, i64 %b) {
;
; AVX1-LABEL: 'splatvar_shift_v4i64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> poison, i64 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:7 SizeLat:6 for: %shift = shl <4 x i64> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
; AVX2-LABEL: 'splatvar_shift_v4i64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> poison, i64 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:2 for: %shift = shl <4 x i64> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v4i64'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> poison, i64 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:7 SizeLat:6 for: %shift = shl <4 x i64> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v4i64'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> poison, i64 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:2 for: %shift = shl <4 x i64> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
; AVX512-LABEL: 'splatvar_shift_v4i64'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> poison, i64 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %shift = shl <4 x i64> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
@@ -610,31 +610,31 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, i64 %b) {
;
; AVX1-LABEL: 'splatvar_shift_v8i64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> poison, i64 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:12 for: %shift = shl <8 x i64> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
; AVX2-LABEL: 'splatvar_shift_v8i64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> poison, i64 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:8 SizeLat:4 for: %shift = shl <8 x i64> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v8i64'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> poison, i64 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:12 for: %shift = shl <8 x i64> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v8i64'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> poison, i64 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:8 SizeLat:4 for: %shift = shl <8 x i64> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
; AVX512-LABEL: 'splatvar_shift_v8i64'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> poison, i64 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> poison, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %shift = shl <8 x i64> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
@@ -696,31 +696,31 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, i32 %b) {
;
; AVX1-LABEL: 'splatvar_shift_v8i32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> poison, i32 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:7 SizeLat:7 for: %shift = shl <8 x i32> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
; AVX2-LABEL: 'splatvar_shift_v8i32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> poison, i32 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = shl <8 x i32> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v8i32'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> poison, i32 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:7 SizeLat:7 for: %shift = shl <8 x i32> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v8i32'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> poison, i32 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = shl <8 x i32> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
; AVX512-LABEL: 'splatvar_shift_v8i32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> poison, i32 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> poison, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = shl <8 x i32> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
@@ -739,31 +739,31 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) {
;
; AVX1-LABEL: 'splatvar_shift_v16i32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> poison, i32 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:14 SizeLat:14 for: %shift = shl <16 x i32> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
; AVX2-LABEL: 'splatvar_shift_v16i32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> poison, i32 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:6 for: %shift = shl <16 x i32> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v16i32'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> poison, i32 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:14 SizeLat:14 for: %shift = shl <16 x i32> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v16i32'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> poison, i32 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:6 for: %shift = shl <16 x i32> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
; AVX512-LABEL: 'splatvar_shift_v16i32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> poison, i32 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> poison, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = shl <16 x i32> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
@@ -776,49 +776,49 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) {
define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, i16 %b) {
; SSE2-LABEL: 'splatvar_shift_v8i16'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> poison, i16 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %shift = shl <8 x i16> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; SSE42-LABEL: 'splatvar_shift_v8i16'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> poison, i16 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %shift = shl <8 x i16> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; AVX1-LABEL: 'splatvar_shift_v8i16'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> poison, i16 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %shift = shl <8 x i16> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; AVX2-LABEL: 'splatvar_shift_v8i16'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> poison, i16 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %shift = shl <8 x i16> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v8i16'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> poison, i16 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %shift = shl <8 x i16> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v8i16'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> poison, i16 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %shift = shl <8 x i16> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; AVX512-LABEL: 'splatvar_shift_v8i16'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> poison, i16 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %shift = shl <8 x i16> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; SLM-LABEL: 'splatvar_shift_v8i16'
; SLM-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> poison, i16 %b, i32 0
-; SLM-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer
; SLM-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %shift = shl <8 x i16> %a, %splat
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
@@ -831,49 +831,49 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, i16 %b) {
define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, i16 %b) {
; SSE2-LABEL: 'splatvar_shift_v16i16'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> poison, i16 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %shift = shl <16 x i16> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; SSE42-LABEL: 'splatvar_shift_v16i16'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> poison, i16 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %shift = shl <16 x i16> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; AVX1-LABEL: 'splatvar_shift_v16i16'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> poison, i16 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:7 SizeLat:7 for: %shift = shl <16 x i16> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; AVX2-LABEL: 'splatvar_shift_v16i16'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> poison, i16 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = shl <16 x i16> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v16i16'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> poison, i16 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:7 SizeLat:7 for: %shift = shl <16 x i16> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v16i16'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> poison, i16 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = shl <16 x i16> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; AVX512-LABEL: 'splatvar_shift_v16i16'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> poison, i16 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = shl <16 x i16> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; SLM-LABEL: 'splatvar_shift_v16i16'
; SLM-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> poison, i16 %b, i32 0
-; SLM-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> poison, <16 x i32> zeroinitializer
; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %shift = shl <16 x i16> %a, %splat
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
@@ -886,73 +886,73 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, i16 %b) {
define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
; SSE2-LABEL: 'splatvar_shift_v32i16'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %shift = shl <32 x i16> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; SSE42-LABEL: 'splatvar_shift_v32i16'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %shift = shl <32 x i16> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX1-LABEL: 'splatvar_shift_v32i16'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:14 SizeLat:14 for: %shift = shl <32 x i16> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX2-LABEL: 'splatvar_shift_v32i16'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:6 for: %shift = shl <32 x i16> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v32i16'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:14 SizeLat:14 for: %shift = shl <32 x i16> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v32i16'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:6 for: %shift = shl <32 x i16> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512F-LABEL: 'splatvar_shift_v32i16'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:10 SizeLat:7 for: %shift = shl <32 x i16> %a, %splat
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512BW-LABEL: 'splatvar_shift_v32i16'
; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = shl <32 x i16> %a, %splat
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512VL-LABEL: 'splatvar_shift_v32i16'
; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; AVX512VL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; AVX512VL-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:10 SizeLat:7 for: %shift = shl <32 x i16> %a, %splat
; AVX512VL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512BWVL-LABEL: 'splatvar_shift_v32i16'
; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = shl <32 x i16> %a, %splat
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; SLM-LABEL: 'splatvar_shift_v32i16'
; SLM-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; SLM-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; SLM-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %shift = shl <32 x i16> %a, %splat
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512GFNI-LABEL: 'splatvar_shift_v32i16'
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = shl <32 x i16> %a, %splat
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
@@ -965,49 +965,49 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
; SSE2-LABEL: 'splatvar_shift_v16i8'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:10 SizeLat:9 for: %shift = shl <16 x i8> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; SSE42-LABEL: 'splatvar_shift_v16i8'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:10 SizeLat:9 for: %shift = shl <16 x i8> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX1-LABEL: 'splatvar_shift_v16i8'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:8 for: %shift = shl <16 x i8> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX2-LABEL: 'splatvar_shift_v16i8'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:5 SizeLat:7 for: %shift = shl <16 x i8> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v16i8'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %shift = shl <16 x i8> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v16i8'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:5 SizeLat:7 for: %shift = shl <16 x i8> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX512-LABEL: 'splatvar_shift_v16i8'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:5 SizeLat:7 for: %shift = shl <16 x i8> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; SLM-LABEL: 'splatvar_shift_v16i8'
; SLM-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> poison, i8 %b, i32 0
-; SLM-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> poison, <16 x i32> zeroinitializer
; SLM-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:10 SizeLat:9 for: %shift = shl <16 x i8> %a, %splat
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
@@ -1020,49 +1020,49 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
; SSE2-LABEL: 'splatvar_shift_v32i8'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:12 Lat:20 SizeLat:18 for: %shift = shl <32 x i8> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; SSE42-LABEL: 'splatvar_shift_v32i8'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:18 CodeSize:12 Lat:20 SizeLat:18 for: %shift = shl <32 x i8> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX1-LABEL: 'splatvar_shift_v32i8'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:11 Lat:8 SizeLat:14 for: %shift = shl <32 x i8> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX2-LABEL: 'splatvar_shift_v32i8'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:7 SizeLat:8 for: %shift = shl <32 x i8> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v32i8'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:7 SizeLat:6 for: %shift = shl <32 x i8> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v32i8'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:7 SizeLat:8 for: %shift = shl <32 x i8> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX512-LABEL: 'splatvar_shift_v32i8'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:7 SizeLat:8 for: %shift = shl <32 x i8> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; SLM-LABEL: 'splatvar_shift_v32i8'
; SLM-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
-; SLM-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
; SLM-NEXT: Cost Model: Found costs of RThru:18 CodeSize:12 Lat:20 SizeLat:18 for: %shift = shl <32 x i8> %a, %splat
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
@@ -1075,73 +1075,73 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) {
; SSE2-LABEL: 'splatvar_shift_v64i8'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:36 CodeSize:24 Lat:40 SizeLat:36 for: %shift = shl <64 x i8> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; SSE42-LABEL: 'splatvar_shift_v64i8'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:36 CodeSize:24 Lat:40 SizeLat:36 for: %shift = shl <64 x i8> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX1-LABEL: 'splatvar_shift_v64i8'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:22 Lat:16 SizeLat:28 for: %shift = shl <64 x i8> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX2-LABEL: 'splatvar_shift_v64i8'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:14 SizeLat:16 for: %shift = shl <64 x i8> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v64i8'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:14 SizeLat:12 for: %shift = shl <64 x i8> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v64i8'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:14 SizeLat:16 for: %shift = shl <64 x i8> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512F-LABEL: 'splatvar_shift_v64i8'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:15 CodeSize:27 Lat:19 SizeLat:33 for: %shift = shl <64 x i8> %a, %splat
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512BW-LABEL: 'splatvar_shift_v64i8'
; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:7 SizeLat:8 for: %shift = shl <64 x i8> %a, %splat
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512VL-LABEL: 'splatvar_shift_v64i8'
; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; AVX512VL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; AVX512VL-NEXT: Cost Model: Found costs of RThru:15 CodeSize:27 Lat:19 SizeLat:33 for: %shift = shl <64 x i8> %a, %splat
; AVX512VL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512BWVL-LABEL: 'splatvar_shift_v64i8'
; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:7 SizeLat:8 for: %shift = shl <64 x i8> %a, %splat
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; SLM-LABEL: 'splatvar_shift_v64i8'
; SLM-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; SLM-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; SLM-NEXT: Cost Model: Found costs of RThru:36 CodeSize:24 Lat:40 SizeLat:36 for: %shift = shl <64 x i8> %a, %splat
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512GFNI-LABEL: 'splatvar_shift_v64i8'
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:7 SizeLat:8 for: %shift = shl <64 x i8> %a, %splat
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-shl-cost.ll b/llvm/test/Analysis/CostModel/X86/vshift-shl-cost.ll
index a3536ec..b2435be 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-shl-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-shl-cost.ll
@@ -567,31 +567,31 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, i64 %b) {
;
; AVX1-LABEL: 'splatvar_shift_v4i64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:7 SizeLat:6 for: %shift = shl <4 x i64> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
; AVX2-LABEL: 'splatvar_shift_v4i64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:2 for: %shift = shl <4 x i64> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v4i64'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:7 SizeLat:6 for: %shift = shl <4 x i64> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v4i64'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:2 for: %shift = shl <4 x i64> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
; AVX512-LABEL: 'splatvar_shift_v4i64'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <4 x i64> undef, i64 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %shift = shl <4 x i64> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %shift
;
@@ -610,31 +610,31 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, i64 %b) {
;
; AVX1-LABEL: 'splatvar_shift_v8i64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:12 for: %shift = shl <8 x i64> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
; AVX2-LABEL: 'splatvar_shift_v8i64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:8 SizeLat:4 for: %shift = shl <8 x i64> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v8i64'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:14 SizeLat:12 for: %shift = shl <8 x i64> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v8i64'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:8 SizeLat:4 for: %shift = shl <8 x i64> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
; AVX512-LABEL: 'splatvar_shift_v8i64'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i64> undef, i64 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:2 for: %shift = shl <8 x i64> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i64> %shift
;
@@ -696,31 +696,31 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, i32 %b) {
;
; AVX1-LABEL: 'splatvar_shift_v8i32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:7 SizeLat:7 for: %shift = shl <8 x i32> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
; AVX2-LABEL: 'splatvar_shift_v8i32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = shl <8 x i32> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v8i32'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:7 SizeLat:7 for: %shift = shl <8 x i32> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v8i32'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = shl <8 x i32> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
; AVX512-LABEL: 'splatvar_shift_v8i32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i32> undef, i32 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = shl <8 x i32> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %shift
;
@@ -739,31 +739,31 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) {
;
; AVX1-LABEL: 'splatvar_shift_v16i32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:14 SizeLat:14 for: %shift = shl <16 x i32> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
; AVX2-LABEL: 'splatvar_shift_v16i32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:6 for: %shift = shl <16 x i32> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v16i32'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:14 SizeLat:14 for: %shift = shl <16 x i32> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v16i32'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:6 for: %shift = shl <16 x i32> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
; AVX512-LABEL: 'splatvar_shift_v16i32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i32> undef, i32 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = shl <16 x i32> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i32> %shift
;
@@ -776,49 +776,49 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) {
define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, i16 %b) {
; SSE2-LABEL: 'splatvar_shift_v8i16'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> undef, i16 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %shift = shl <8 x i16> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; SSE42-LABEL: 'splatvar_shift_v8i16'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> undef, i16 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %shift = shl <8 x i16> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; AVX1-LABEL: 'splatvar_shift_v8i16'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> undef, i16 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %shift = shl <8 x i16> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; AVX2-LABEL: 'splatvar_shift_v8i16'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> undef, i16 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %shift = shl <8 x i16> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v8i16'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> undef, i16 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %shift = shl <8 x i16> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v8i16'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> undef, i16 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %shift = shl <8 x i16> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; AVX512-LABEL: 'splatvar_shift_v8i16'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> undef, i16 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %shift = shl <8 x i16> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
; SLM-LABEL: 'splatvar_shift_v8i16'
; SLM-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <8 x i16> undef, i16 %b, i32 0
-; SLM-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
; SLM-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %shift = shl <8 x i16> %a, %splat
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %shift
;
@@ -831,49 +831,49 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, i16 %b) {
define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, i16 %b) {
; SSE2-LABEL: 'splatvar_shift_v16i16'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> undef, i16 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %shift = shl <16 x i16> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; SSE42-LABEL: 'splatvar_shift_v16i16'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> undef, i16 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %shift = shl <16 x i16> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; AVX1-LABEL: 'splatvar_shift_v16i16'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> undef, i16 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:7 SizeLat:7 for: %shift = shl <16 x i16> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; AVX2-LABEL: 'splatvar_shift_v16i16'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> undef, i16 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = shl <16 x i16> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v16i16'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> undef, i16 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:7 SizeLat:7 for: %shift = shl <16 x i16> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v16i16'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> undef, i16 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = shl <16 x i16> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; AVX512-LABEL: 'splatvar_shift_v16i16'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> undef, i16 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = shl <16 x i16> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
; SLM-LABEL: 'splatvar_shift_v16i16'
; SLM-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i16> undef, i16 %b, i32 0
-; SLM-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %shift = shl <16 x i16> %a, %splat
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %shift
;
@@ -886,73 +886,73 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, i16 %b) {
define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
; SSE2-LABEL: 'splatvar_shift_v32i16'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:2 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %shift = shl <32 x i16> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; SSE42-LABEL: 'splatvar_shift_v32i16'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %shift = shl <32 x i16> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX1-LABEL: 'splatvar_shift_v32i16'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:14 SizeLat:14 for: %shift = shl <32 x i16> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX2-LABEL: 'splatvar_shift_v32i16'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:6 for: %shift = shl <32 x i16> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v32i16'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:14 SizeLat:14 for: %shift = shl <32 x i16> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v32i16'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:8 SizeLat:6 for: %shift = shl <32 x i16> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512F-LABEL: 'splatvar_shift_v32i16'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:10 SizeLat:7 for: %shift = shl <32 x i16> %a, %splat
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512BW-LABEL: 'splatvar_shift_v32i16'
; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = shl <32 x i16> %a, %splat
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512VL-LABEL: 'splatvar_shift_v32i16'
; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512VL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512VL-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:10 SizeLat:7 for: %shift = shl <32 x i16> %a, %splat
; AVX512VL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512BWVL-LABEL: 'splatvar_shift_v32i16'
; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = shl <32 x i16> %a, %splat
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; SLM-LABEL: 'splatvar_shift_v32i16'
; SLM-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; SLM-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; SLM-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %shift = shl <32 x i16> %a, %splat
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
; AVX512GFNI-LABEL: 'splatvar_shift_v32i16'
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:3 for: %shift = shl <32 x i16> %a, %splat
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i16> %shift
;
@@ -965,49 +965,49 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
; SSE2-LABEL: 'splatvar_shift_v16i8'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:10 SizeLat:9 for: %shift = shl <16 x i8> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; SSE42-LABEL: 'splatvar_shift_v16i8'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:10 SizeLat:9 for: %shift = shl <16 x i8> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX1-LABEL: 'splatvar_shift_v16i8'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:4 SizeLat:8 for: %shift = shl <16 x i8> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX2-LABEL: 'splatvar_shift_v16i8'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:5 SizeLat:7 for: %shift = shl <16 x i8> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v16i8'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %shift = shl <16 x i8> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v16i8'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:5 SizeLat:7 for: %shift = shl <16 x i8> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; AVX512-LABEL: 'splatvar_shift_v16i8'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:5 SizeLat:7 for: %shift = shl <16 x i8> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
; SLM-LABEL: 'splatvar_shift_v16i8'
; SLM-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
-; SLM-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
; SLM-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:10 SizeLat:9 for: %shift = shl <16 x i8> %a, %splat
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %shift
;
@@ -1020,49 +1020,49 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
; SSE2-LABEL: 'splatvar_shift_v32i8'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:12 Lat:20 SizeLat:18 for: %shift = shl <32 x i8> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; SSE42-LABEL: 'splatvar_shift_v32i8'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:18 CodeSize:12 Lat:20 SizeLat:18 for: %shift = shl <32 x i8> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX1-LABEL: 'splatvar_shift_v32i8'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:11 Lat:8 SizeLat:14 for: %shift = shl <32 x i8> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX2-LABEL: 'splatvar_shift_v32i8'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:7 SizeLat:8 for: %shift = shl <32 x i8> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v32i8'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:7 SizeLat:6 for: %shift = shl <32 x i8> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v32i8'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:7 SizeLat:8 for: %shift = shl <32 x i8> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; AVX512-LABEL: 'splatvar_shift_v32i8'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; AVX512-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:7 SizeLat:8 for: %shift = shl <32 x i8> %a, %splat
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
; SLM-LABEL: 'splatvar_shift_v32i8'
; SLM-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
-; SLM-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
; SLM-NEXT: Cost Model: Found costs of RThru:18 CodeSize:12 Lat:20 SizeLat:18 for: %shift = shl <32 x i8> %a, %splat
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %shift
;
@@ -1075,73 +1075,73 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) {
; SSE2-LABEL: 'splatvar_shift_v64i8'
; SSE2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; SSE2-NEXT: Cost Model: Found costs of 3 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:4 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; SSE2-NEXT: Cost Model: Found costs of RThru:36 CodeSize:24 Lat:40 SizeLat:36 for: %shift = shl <64 x i8> %a, %splat
; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; SSE42-LABEL: 'splatvar_shift_v64i8'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; SSE42-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; SSE42-NEXT: Cost Model: Found costs of RThru:36 CodeSize:24 Lat:40 SizeLat:36 for: %shift = shl <64 x i8> %a, %splat
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX1-LABEL: 'splatvar_shift_v64i8'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:22 Lat:16 SizeLat:28 for: %shift = shl <64 x i8> %a, %splat
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX2-LABEL: 'splatvar_shift_v64i8'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:14 SizeLat:16 for: %shift = shl <64 x i8> %a, %splat
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; XOPAVX1-LABEL: 'splatvar_shift_v64i8'
; XOPAVX1-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; XOPAVX1-NEXT: Cost Model: Found costs of 2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; XOPAVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:4 SizeLat:6 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:14 SizeLat:12 for: %shift = shl <64 x i8> %a, %splat
; XOPAVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; XOPAVX2-LABEL: 'splatvar_shift_v64i8'
; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; XOPAVX2-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; XOPAVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:14 SizeLat:16 for: %shift = shl <64 x i8> %a, %splat
; XOPAVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512F-LABEL: 'splatvar_shift_v64i8'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; AVX512F-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512F-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512F-NEXT: Cost Model: Found costs of RThru:15 CodeSize:27 Lat:19 SizeLat:33 for: %shift = shl <64 x i8> %a, %splat
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512BW-LABEL: 'splatvar_shift_v64i8'
; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; AVX512BW-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512BW-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:7 SizeLat:8 for: %shift = shl <64 x i8> %a, %splat
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512VL-LABEL: 'splatvar_shift_v64i8'
; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; AVX512VL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512VL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512VL-NEXT: Cost Model: Found costs of RThru:15 CodeSize:27 Lat:19 SizeLat:33 for: %shift = shl <64 x i8> %a, %splat
; AVX512VL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512BWVL-LABEL: 'splatvar_shift_v64i8'
; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; AVX512BWVL-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:7 SizeLat:8 for: %shift = shl <64 x i8> %a, %splat
; AVX512BWVL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; SLM-LABEL: 'splatvar_shift_v64i8'
; SLM-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; SLM-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; SLM-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:2 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; SLM-NEXT: Cost Model: Found costs of RThru:36 CodeSize:24 Lat:40 SizeLat:36 for: %shift = shl <64 x i8> %a, %splat
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
; AVX512GFNI-LABEL: 'splatvar_shift_v64i8'
; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
-; AVX512GFNI-NEXT: Cost Model: Found costs of 1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:7 SizeLat:8 for: %shift = shl <64 x i8> %a, %splat
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <64 x i8> %shift
;
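The hunks above replace a single scalar cost in the splat-shuffle expectations with the full tuple printed under -cost-kind=all: RThru (reciprocal throughput), CodeSize, Lat (latency), and SizeLat (size-and-latency). As a minimal sketch of how such a test queries these costs — mirroring the RUN-line style of the files above, assuming an x86-64 build of opt; the function name here is illustrative and the exact numbers depend on -mattr:

; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 \
; RUN:   -passes="print<cost-model>" -cost-kind=all -disable-output 2>&1 \
; RUN:   | FileCheck %s

define <4 x i64> @splat_shl(<4 x i64> %a, i64 %b) {
  ; Splat the scalar shift amount, then shift by it; each instruction
  ; below gets its own "Cost Model: Found costs of ..." line in the
  ; printer output, matching the CHECK lines updated in this diff.
  %insert = insertelement <4 x i64> poison, i64 %b, i32 0
  %splat = shufflevector <4 x i64> %insert, <4 x i64> poison, <4 x i32> zeroinitializer
  %shift = shl <4 x i64> %a, %splat
  ret <4 x i64> %shift
}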
diff --git a/llvm/test/Analysis/DependenceAnalysis/Banerjee.ll b/llvm/test/Analysis/DependenceAnalysis/Banerjee.ll
index d330152..e0def90 100644
--- a/llvm/test/Analysis/DependenceAnalysis/Banerjee.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/Banerjee.ll
@@ -113,7 +113,7 @@ define void @banerjee1(ptr %A, ptr %B, i64 %m, i64 %n) nounwind uwtable ssp {
; CHECK-NEXT: Src: %2 = load i64, ptr %arrayidx6, align 8 --> Dst: store i64 %2, ptr %B.addr.12, align 8
; CHECK-NEXT: da analyze - confused!
; CHECK-NEXT: Src: store i64 %2, ptr %B.addr.12, align 8 --> Dst: store i64 %2, ptr %B.addr.12, align 8
-; CHECK-NEXT: da analyze - output [* *]!
+; CHECK-NEXT: da analyze - confused!
;
; NORMALIZE-LABEL: 'banerjee1'
; NORMALIZE-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: store i64 0, ptr %arrayidx, align 8
@@ -127,7 +127,7 @@ define void @banerjee1(ptr %A, ptr %B, i64 %m, i64 %n) nounwind uwtable ssp {
; NORMALIZE-NEXT: Src: %2 = load i64, ptr %arrayidx6, align 8 --> Dst: store i64 %2, ptr %B.addr.12, align 8
; NORMALIZE-NEXT: da analyze - confused!
; NORMALIZE-NEXT: Src: store i64 %2, ptr %B.addr.12, align 8 --> Dst: store i64 %2, ptr %B.addr.12, align 8
-; NORMALIZE-NEXT: da analyze - output [* *]!
+; NORMALIZE-NEXT: da analyze - confused!
;
; DELIN-LABEL: 'banerjee1'
; DELIN-NEXT: Src: store i64 0, ptr %arrayidx, align 8 --> Dst: store i64 0, ptr %arrayidx, align 8
@@ -141,7 +141,7 @@ define void @banerjee1(ptr %A, ptr %B, i64 %m, i64 %n) nounwind uwtable ssp {
; DELIN-NEXT: Src: %2 = load i64, ptr %arrayidx6, align 8 --> Dst: store i64 %2, ptr %B.addr.12, align 8
; DELIN-NEXT: da analyze - confused!
; DELIN-NEXT: Src: store i64 %2, ptr %B.addr.12, align 8 --> Dst: store i64 %2, ptr %B.addr.12, align 8
-; DELIN-NEXT: da analyze - output [* *]!
+; DELIN-NEXT: da analyze - confused!
;
entry:
%cmp4 = icmp sgt i64 %n, 0
diff --git a/llvm/test/Analysis/DependenceAnalysis/FlipFlopBaseAddress.ll b/llvm/test/Analysis/DependenceAnalysis/FlipFlopBaseAddress.ll
index 3e3426a..bf2268b 100644
--- a/llvm/test/Analysis/DependenceAnalysis/FlipFlopBaseAddress.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/FlipFlopBaseAddress.ll
@@ -8,11 +8,11 @@
define float @bug41488_test1(float %f) {
; CHECK-LABEL: 'bug41488_test1'
; CHECK-NEXT: Src: %0 = load float, ptr %p, align 4 --> Dst: %0 = load float, ptr %p, align 4
-; CHECK-NEXT: da analyze - input [*]!
+; CHECK-NEXT: da analyze - confused!
; CHECK-NEXT: Src: %0 = load float, ptr %p, align 4 --> Dst: store float %f, ptr %q, align 4
; CHECK-NEXT: da analyze - confused!
; CHECK-NEXT: Src: store float %f, ptr %q, align 4 --> Dst: store float %f, ptr %q, align 4
-; CHECK-NEXT: da analyze - output [*]!
+; CHECK-NEXT: da analyze - confused!
;
entry:
%g = alloca float, align 4
@@ -34,11 +34,11 @@ for.cond.cleanup:
define void @bug41488_test2(i32 %n) {
; CHECK-LABEL: 'bug41488_test2'
; CHECK-NEXT: Src: %0 = load float, ptr %p, align 4 --> Dst: %0 = load float, ptr %p, align 4
-; CHECK-NEXT: da analyze - input [*]!
+; CHECK-NEXT: da analyze - confused!
; CHECK-NEXT: Src: %0 = load float, ptr %p, align 4 --> Dst: store float 0.000000e+00, ptr %q, align 4
; CHECK-NEXT: da analyze - confused!
; CHECK-NEXT: Src: store float 0.000000e+00, ptr %q, align 4 --> Dst: store float 0.000000e+00, ptr %q, align 4
-; CHECK-NEXT: da analyze - output [*]!
+; CHECK-NEXT: da analyze - confused!
;
entry:
%g = alloca float, align 4
@@ -68,7 +68,7 @@ define void @bug53942_foo(i32 noundef %n, ptr noalias nocapture noundef writeonl
; CHECK-NEXT: Src: %.pre = load double, ptr %B, align 8 --> Dst: store double %.pre, ptr %arrayidx2, align 8
; CHECK-NEXT: da analyze - confused!
; CHECK-NEXT: Src: store double %.pre, ptr %arrayidx2, align 8 --> Dst: store double %.pre, ptr %arrayidx2, align 8
-; CHECK-NEXT: da analyze - output [*]!
+; CHECK-NEXT: da analyze - confused!
;
entry:
%cmp8 = icmp sgt i32 %n, 1
@@ -99,11 +99,11 @@ for.body: ; preds = %for.body.preheader,
define void @bug53942_bar(i32 noundef %n, ptr noalias noundef %A, ptr noalias noundef %B) {
; CHECK-LABEL: 'bug53942_bar'
; CHECK-NEXT: Src: %0 = load double, ptr %arrayidx, align 8 --> Dst: %0 = load double, ptr %arrayidx, align 8
-; CHECK-NEXT: da analyze - input [*]!
+; CHECK-NEXT: da analyze - confused!
; CHECK-NEXT: Src: %0 = load double, ptr %arrayidx, align 8 --> Dst: store double %0, ptr %arrayidx8, align 8
; CHECK-NEXT: da analyze - confused!
; CHECK-NEXT: Src: store double %0, ptr %arrayidx8, align 8 --> Dst: store double %0, ptr %arrayidx8, align 8
-; CHECK-NEXT: da analyze - output [*]!
+; CHECK-NEXT: da analyze - confused!
;
entry:
br label %for.cond
@@ -166,14 +166,14 @@ for.end: ; preds = %for.cond.cleanup
; (j % 2 == 0 ? A[i][j] : A[i][j+1]) = 1;
; }
;
-; FIXME: There are loop-carried dependencies between the store instruction. For
+; There are loop-carried dependencies between executions of the store. For
; example, the value of %ptr0 when (i, j) = (0, 1) is %A+8, which is the same
; as when (i, j) = (0, 2).
define void @non_invariant_baseptr_with_identical_obj(ptr %A) {
; CHECK-LABEL: 'non_invariant_baseptr_with_identical_obj'
; CHECK-NEXT: Src: store i32 1, ptr %idx, align 4 --> Dst: store i32 1, ptr %idx, align 4
-; CHECK-NEXT: da analyze - none!
+; CHECK-NEXT: da analyze - confused!
;
entry:
br label %loop.i.header
@@ -216,13 +216,13 @@ exit:
; Similar to the above case, but ptr0 is loop-invariant with respect to the
; k-loop.
;
-; FIXME: Same as the above case, there are loop-carried dependencies between
-; the store.
+; As in the above case, there are loop-carried dependencies between
+; executions of the store.
define void @non_invariant_baseptr_with_identical_obj2(ptr %A) {
; CHECK-LABEL: 'non_invariant_baseptr_with_identical_obj2'
; CHECK-NEXT: Src: store i32 1, ptr %idx, align 4 --> Dst: store i32 1, ptr %idx, align 4
-; CHECK-NEXT: da analyze - none!
+; CHECK-NEXT: da analyze - confused!
;
entry:
br label %loop.i.header
@@ -259,3 +259,58 @@ loop.i.latch:
exit:
ret void
}
+
+; Pseudo-code that is approximately semantically equivalent to the IR below:
+;
+; void f(int A[][32]) {
+; for (int i = 0; i < 100; i++)
+; for (int j = 0; j < 15; j++) {
+; int offset = (j % 2 == 0) ? 1 : 0;
+; A[i][2 * j + offset + 0] = 1;
+; A[i][2 * j + offset + 1] = 1;
+; }
+; }
+;
+; There are loop-carried dependencies between the two stores. For example,
+; A[0][2] is written by the former store when (i, j) = (0, 1) and by the
+; latter store when (i, j) = (0, 0).
+;
+define void @non_invariant_baseptr_with_identical_obj3(ptr %A) {
+; CHECK-LABEL: 'non_invariant_baseptr_with_identical_obj3'
+; CHECK-NEXT: Src: store i32 1, ptr %idx0, align 4 --> Dst: store i32 1, ptr %idx0, align 4
+; CHECK-NEXT: da analyze - confused!
+; CHECK-NEXT: Src: store i32 1, ptr %idx0, align 4 --> Dst: store i32 1, ptr %idx1, align 4
+; CHECK-NEXT: da analyze - confused!
+; CHECK-NEXT: Src: store i32 1, ptr %idx1, align 4 --> Dst: store i32 1, ptr %idx1, align 4
+; CHECK-NEXT: da analyze - confused!
+;
+entry:
+ br label %loop.i.header
+
+loop.i.header:
+ %i = phi i32 [ 0, %entry ], [ %i.inc, %loop.i.latch ]
+ %A1 = getelementptr i32, ptr %A, i32 1
+ br label %loop.j
+
+loop.j:
+ %j = phi i32 [ 0, %loop.i.header ], [ %j.inc, %loop.j ]
+ %ptr0 = phi ptr [ %A1, %loop.i.header ], [ %ptr1, %loop.j ]
+ %ptr1 = phi ptr [ %A, %loop.i.header ], [ %ptr0, %loop.j ]
+ %j2_0 = shl i32 %j, 1
+ %j2_1 = add i32 %j2_0, 1
+ %idx0 = getelementptr [32 x i32], ptr %ptr0, i32 %i, i32 %j2_0
+ %idx1 = getelementptr [32 x i32], ptr %ptr0, i32 %i, i32 %j2_1
+ store i32 1, ptr %idx0
+ store i32 1, ptr %idx1
+ %j.inc = add i32 %j, 1
+ %cmp.j = icmp slt i32 %j.inc, 15
+ br i1 %cmp.j, label %loop.j, label %loop.i.latch
+
+loop.i.latch:
+ %i.inc = add i32 %i, 1
+ %cmp.i = icmp slt i32 %i.inc, 100
+ br i1 %cmp.i, label %loop.i.header, label %exit
+
+exit:
+ ret void
+}
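
A quick way to check the overlap claimed in the comment above: the flip-flopping base pointer makes the column offset alternate between 1 and 0, so consecutive j iterations write a common column. A minimal C sketch (illustrative only, not part of the test) enumerating the first few stores of the i = 0 row:

    #include <stdio.h>

    int main(void) {
      /* Mirrors the pseudo-code: offset is 1 when j is even, 0 when odd. */
      for (int j = 0; j < 3; ++j) {
        int offset = (j % 2 == 0) ? 1 : 0;
        printf("j=%d writes A[0][%d] and A[0][%d]\n",
               j, 2 * j + offset, 2 * j + offset + 1);
      }
      /* j=0 writes columns 1,2; j=1 writes columns 2,3: column 2 is written
         twice, which is the loop-carried dependence DA must not miss. */
      return 0;
    }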
diff --git a/llvm/test/Analysis/DependenceAnalysis/GCD.ll b/llvm/test/Analysis/DependenceAnalysis/GCD.ll
index c0e1362..03343e7 100644
--- a/llvm/test/Analysis/DependenceAnalysis/GCD.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/GCD.ll
@@ -398,7 +398,7 @@ define void @gcd6(i64 %n, ptr %A, ptr %B) nounwind uwtable ssp {
; CHECK-NEXT: Src: %2 = load i32, ptr %arrayidx9, align 4 --> Dst: store i32 %2, ptr %B.addr.12, align 4
; CHECK-NEXT: da analyze - confused!
; CHECK-NEXT: Src: store i32 %2, ptr %B.addr.12, align 4 --> Dst: store i32 %2, ptr %B.addr.12, align 4
-; CHECK-NEXT: da analyze - output [* *]!
+; CHECK-NEXT: da analyze - confused!
;
entry:
%cmp4 = icmp sgt i64 %n, 0
@@ -475,7 +475,7 @@ define void @gcd7(i32 %n, ptr %A, ptr %B) nounwind uwtable ssp {
; CHECK-NEXT: Src: %11 = load i32, ptr %arrayidx12, align 4 --> Dst: store i32 %11, ptr %B.addr.12, align 4
; CHECK-NEXT: da analyze - confused!
; CHECK-NEXT: Src: store i32 %11, ptr %B.addr.12, align 4 --> Dst: store i32 %11, ptr %B.addr.12, align 4
-; CHECK-NEXT: da analyze - output [* *]!
+; CHECK-NEXT: da analyze - confused!
;
entry:
%0 = zext i32 %n to i64
@@ -566,7 +566,7 @@ define void @gcd8(i32 %n, ptr %A, ptr %B) nounwind uwtable ssp {
; CHECK-NEXT: Src: %5 = load i32, ptr %arrayidx12, align 4 --> Dst: store i32 %5, ptr %B.addr.12, align 4
; CHECK-NEXT: da analyze - confused!
; CHECK-NEXT: Src: store i32 %5, ptr %B.addr.12, align 4 --> Dst: store i32 %5, ptr %B.addr.12, align 4
-; CHECK-NEXT: da analyze - output [* *]!
+; CHECK-NEXT: da analyze - confused!
;
entry:
%cmp4 = icmp sgt i32 %n, 0
@@ -650,7 +650,7 @@ define void @gcd9(i32 %n, ptr %A, ptr %B) nounwind uwtable ssp {
; CHECK-NEXT: Src: %11 = load i32, ptr %arrayidx12, align 4 --> Dst: store i32 %11, ptr %B.addr.12, align 4
; CHECK-NEXT: da analyze - confused!
; CHECK-NEXT: Src: store i32 %11, ptr %B.addr.12, align 4 --> Dst: store i32 %11, ptr %B.addr.12, align 4
-; CHECK-NEXT: da analyze - output [* *]!
+; CHECK-NEXT: da analyze - confused!
;
entry:
%0 = zext i32 %n to i64
diff --git a/llvm/test/Analysis/DependenceAnalysis/NonAffineExpr.ll b/llvm/test/Analysis/DependenceAnalysis/NonAffineExpr.ll
index d983bd4..3e110ac 100644
--- a/llvm/test/Analysis/DependenceAnalysis/NonAffineExpr.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/NonAffineExpr.ll
@@ -12,7 +12,7 @@ define void @f(ptr %a, i32 %n, i1 %arg) align 2 {
; CHECK-NEXT: Src: %t.2 = load ptr, ptr %a, align 4 --> Dst: %t.4 = load i32, ptr %t.3, align 4
; CHECK-NEXT: da analyze - confused!
; CHECK-NEXT: Src: %t.4 = load i32, ptr %t.3, align 4 --> Dst: %t.4 = load i32, ptr %t.3, align 4
-; CHECK-NEXT: da analyze - input [* *]!
+; CHECK-NEXT: da analyze - confused!
;
for.preheader:
%t.0 = ashr exact i32 %n, 3
diff --git a/llvm/test/Analysis/DependenceAnalysis/Preliminary.ll b/llvm/test/Analysis/DependenceAnalysis/Preliminary.ll
index 4ab8777..8cb0e2a 100644
--- a/llvm/test/Analysis/DependenceAnalysis/Preliminary.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/Preliminary.ll
@@ -69,7 +69,7 @@ define void @p2(i64 %n, ptr %A, ptr %B) nounwind uwtable ssp {
; CHECK-NEXT: Src: %0 = load i64, ptr %arrayidx17, align 8 --> Dst: store i64 %0, ptr %B.addr.24, align 8
; CHECK-NEXT: da analyze - confused!
; CHECK-NEXT: Src: store i64 %0, ptr %B.addr.24, align 8 --> Dst: store i64 %0, ptr %B.addr.24, align 8
-; CHECK-NEXT: da analyze - output [* * *]!
+; CHECK-NEXT: da analyze - confused!
;
entry:
%cmp10 = icmp sgt i64 %n, 0
diff --git a/llvm/test/Analysis/DependenceAnalysis/PreliminaryNoValidityCheckFixedSize.ll b/llvm/test/Analysis/DependenceAnalysis/PreliminaryNoValidityCheckFixedSize.ll
index 4040187..e67cae7d 100644
--- a/llvm/test/Analysis/DependenceAnalysis/PreliminaryNoValidityCheckFixedSize.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/PreliminaryNoValidityCheckFixedSize.ll
@@ -28,7 +28,7 @@ define void @p2(i64 %n, ptr %A, ptr %B) nounwind uwtable ssp {
; CHECK-NEXT: Src: %0 = load i64, ptr %arrayidx17, align 8 --> Dst: store i64 %0, ptr %B.addr.24, align 8
; CHECK-NEXT: da analyze - confused!
; CHECK-NEXT: Src: store i64 %0, ptr %B.addr.24, align 8 --> Dst: store i64 %0, ptr %B.addr.24, align 8
-; CHECK-NEXT: da analyze - output [* * *]!
+; CHECK-NEXT: da analyze - confused!
;
; LIN-LABEL: 'p2'
; LIN-NEXT: Src: store i64 %i.011, ptr %arrayidx8, align 8 --> Dst: store i64 %i.011, ptr %arrayidx8, align 8
@@ -42,7 +42,7 @@ define void @p2(i64 %n, ptr %A, ptr %B) nounwind uwtable ssp {
; LIN-NEXT: Src: %0 = load i64, ptr %arrayidx17, align 8 --> Dst: store i64 %0, ptr %B.addr.24, align 8
; LIN-NEXT: da analyze - confused!
; LIN-NEXT: Src: store i64 %0, ptr %B.addr.24, align 8 --> Dst: store i64 %0, ptr %B.addr.24, align 8
-; LIN-NEXT: da analyze - output [* * *]!
+; LIN-NEXT: da analyze - confused!
;
entry:
%cmp10 = icmp sgt i64 %n, 0
diff --git a/llvm/test/Analysis/DependenceAnalysis/SymbolicRDIV.ll b/llvm/test/Analysis/DependenceAnalysis/SymbolicRDIV.ll
index f64a748..8b9aa25 100644
--- a/llvm/test/Analysis/DependenceAnalysis/SymbolicRDIV.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/SymbolicRDIV.ll
@@ -437,7 +437,7 @@ define void @symbolicrdiv6(ptr %A, ptr %B, i64 %n1, i64 %n2) nounwind uwtable ss
; CHECK-NEXT: Src: %0 = load i32, ptr %arrayidx4, align 4 --> Dst: store i32 %0, ptr %B.addr.12, align 4
; CHECK-NEXT: da analyze - confused!
; CHECK-NEXT: Src: store i32 %0, ptr %B.addr.12, align 4 --> Dst: store i32 %0, ptr %B.addr.12, align 4
-; CHECK-NEXT: da analyze - output [* *]!
+; CHECK-NEXT: da analyze - confused!
;
entry:
%cmp4 = icmp eq i64 %n1, 0
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/runtime-check-known-true.ll b/llvm/test/Analysis/LoopAccessAnalysis/runtime-check-known-true.ll
new file mode 100644
index 0000000..30c8088
--- /dev/null
+++ b/llvm/test/Analysis/LoopAccessAnalysis/runtime-check-known-true.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='print<access-info>' -disable-output %s 2>&1 | FileCheck %s
+
+; TODO: The accesses are known to be entirely before or after one another.
+define void @test_runtime_check_known_false_after_construction(ptr %start.1, ptr %start.2, ptr %end) {
+; CHECK-LABEL: 'test_runtime_check_known_false_after_construction'
+; CHECK-NEXT: loop:
+; CHECK-NEXT: Memory dependences are safe with run-time checks
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Check 0:
+; CHECK-NEXT: Comparing group GRP0:
+; CHECK-NEXT: %ptr.iv.1 = phi ptr [ %ptr.iv.1.next, %loop ], [ %start.1, %entry ]
+; CHECK-NEXT: Against group GRP1:
+; CHECK-NEXT: %ptr.iv.2 = phi ptr [ %ptr.iv.2.next, %loop ], [ %start.2.diff, %entry ]
+; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: Group GRP0:
+; CHECK-NEXT: (Low: ((-8 * ((2305843009213693951 * (8 + (-1 * (ptrtoint ptr %start.1 to i64)) + (ptrtoint ptr %end to i64))) /u 8)) + %start.1) High: (8 + %start.1))
+; CHECK-NEXT: Member: {%start.1,+,-8}<%loop>
+; CHECK-NEXT: Group GRP1:
+; CHECK-NEXT: (Low: (-8 + (-8 * ((2305843009213693951 * (8 + (-1 * (ptrtoint ptr %start.1 to i64)) + (ptrtoint ptr %end to i64))) /u 8)) + (-1 * (ptrtoint ptr %start.2 to i64)) + (ptrtoint ptr %start.1 to i64) + %start.2) High: ((-1 * (ptrtoint ptr %start.2 to i64)) + (ptrtoint ptr %start.1 to i64) + %start.2))
+; CHECK-NEXT: Member: {(-8 + (-1 * (ptrtoint ptr %start.2 to i64)) + (ptrtoint ptr %start.1 to i64) + %start.2),+,-8}<%loop>
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-NEXT: Equal predicate: (zext i3 ((trunc i64 (ptrtoint ptr %end to i64) to i3) + (-1 * (trunc i64 (ptrtoint ptr %start.1 to i64) to i3))) to i64) == 0
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ %gep.start.2 = getelementptr i8, ptr %start.2, i64 8
+ %start.1.int = ptrtoint ptr %start.1 to i64
+ %start.2.int = ptrtoint ptr %gep.start.2 to i64
+ %diff = sub i64 %start.1.int, %start.2.int
+ %start.2.diff = getelementptr i8, ptr %start.2, i64 %diff
+ br label %loop
+
+loop:
+ %ptr.iv.1 = phi ptr [ %ptr.iv.1.next, %loop ], [ %start.1, %entry ]
+ %ptr.iv.2 = phi ptr [ %ptr.iv.2.next, %loop ], [ %start.2.diff, %entry ]
+ %ptr.iv.2.next = getelementptr i8, ptr %ptr.iv.2, i64 -8
+ %ptr.iv.1.next = getelementptr i8, ptr %ptr.iv.1, i64 -8
+ %l = load i64, ptr %ptr.iv.2, align 8
+ store i64 %l, ptr %ptr.iv.1, align 8
+ %ec = icmp eq ptr %ptr.iv.2, %end
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
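
The entry block above effectively rebases %start.2 onto %start.1: %start.2.diff computes %start.2 + (%start.1 - (%start.2 + 8)), i.e. %start.1 - 8, so the two pointer inductions stay exactly 8 bytes apart as both step by -8. A minimal C sketch of that address arithmetic (the example addresses are made up):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      /* Stand-ins for the two incoming pointers (arbitrary values). */
      int64_t start1 = 1000, start2 = 4000;
      int64_t gep_start2 = start2 + 8;      /* %gep.start.2 */
      int64_t diff = start1 - gep_start2;   /* %diff */
      int64_t start2_diff = start2 + diff;  /* %start.2.diff == start1 - 8 */
      /* Both pointer IVs step by -8, so the 8-byte gap is loop-invariant. */
      for (int k = 0; k < 3; ++k)
        printf("iter %d: iv1=%lld iv2=%lld\n", k,
               (long long)(start1 - 8 * k), (long long)(start2_diff - 8 * k));
      return 0;
    }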
diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt
index 3426b6f..3042b8f 100644
--- a/llvm/test/CMakeLists.txt
+++ b/llvm/test/CMakeLists.txt
@@ -19,6 +19,7 @@ llvm_canonicalize_cmake_booleans(
LLVM_EXAMPLEIRTRANSFORMS_LINK_INTO_TOOLS
LLVM_HAVE_TF_AOT
LLVM_HAVE_TFLITE
+ LLVM_ENABLE_PROFCHECK
LLVM_INLINER_MODEL_AUTOGENERATED
LLVM_RAEVICT_MODEL_AUTOGENERATED
LLVM_ENABLE_EXPENSIVE_CHECKS
diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir
index cf4f321..491d693 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir
+++ b/llvm/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir
@@ -1,8 +1,8 @@
-# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=cortex-a57 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=UNPROFITABLE,ALL %s
-# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=falkor -enable-unsafe-fp-math %s -machine-combiner-verify-pattern-order=true | FileCheck --check-prefixes=PROFITABLE,ALL %s
-# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=exynos-m3 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=PROFITABLE,ALL %s
-# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx2t99 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=PROFITABLE,ALL %s
-# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx3t110 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=PROFITABLE,ALL %s
+# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=cortex-a57 -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=UNPROFITABLE,ALL %s
+# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=falkor %s -machine-combiner-verify-pattern-order=true | FileCheck --check-prefixes=PROFITABLE,ALL %s
+# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=exynos-m3 -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=PROFITABLE,ALL %s
+# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx2t99 -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=PROFITABLE,ALL %s
+# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx3t110 -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=PROFITABLE,ALL %s
#
name: f1_2s
registers:
@@ -16,18 +16,18 @@ body: |
%2:fpr64 = COPY $d2
%1:fpr64 = COPY $d1
%0:fpr64 = COPY $d0
- %3:fpr64 = FMULv2f32 %0, %1, implicit $fpcr
- %4:fpr64 = FSUBv2f32 killed %3, %2, implicit $fpcr
+ %3:fpr64 = contract FMULv2f32 %0, %1, implicit $fpcr
+ %4:fpr64 = contract FSUBv2f32 killed %3, %2, implicit $fpcr
$d0 = COPY %4
RET_ReallyLR implicit $d0
...
# UNPROFITABLE-LABEL: name: f1_2s
-# UNPROFITABLE: [[R1:%[0-9]+]]:fpr64 = FNEGv2f32 %2
+# UNPROFITABLE: [[R1:%[0-9]+]]:fpr64 = contract FNEGv2f32 %2
# UNPROFITABLE-NEXT: FMLAv2f32 killed [[R1]], %0, %1, implicit $fpcr
#
# PROFITABLE-LABEL: name: f1_2s
-# PROFITABLE: [[R1:%[0-9]+]]:fpr64 = FNEGv2f32 %2
+# PROFITABLE: [[R1:%[0-9]+]]:fpr64 = contract FNEGv2f32 %2
# PROFITABLE-NEXT: FMLAv2f32 killed [[R1]], %0, %1, implicit $fpcr
---
name: f1_4s
@@ -42,18 +42,18 @@ body: |
%2:fpr128 = COPY $q2
%1:fpr128 = COPY $q1
%0:fpr128 = COPY $q0
- %3:fpr128 = FMULv4f32 %0, %1, implicit $fpcr
- %4:fpr128 = FSUBv4f32 killed %3, %2, implicit $fpcr
+ %3:fpr128 = contract FMULv4f32 %0, %1, implicit $fpcr
+ %4:fpr128 = contract FSUBv4f32 killed %3, %2, implicit $fpcr
$q0 = COPY %4
RET_ReallyLR implicit $q0
...
# UNPROFITABLE-LABEL: name: f1_4s
-# UNPROFITABLE: [[R1:%[0-9]+]]:fpr128 = FMULv4f32 %0, %1, implicit $fpcr
+# UNPROFITABLE: [[R1:%[0-9]+]]:fpr128 = contract FMULv4f32 %0, %1, implicit $fpcr
# UNPROFITABLE-NEXT: FSUBv4f32 killed [[R1]], %2, implicit $fpcr
#
# PROFITABLE-LABEL: name: f1_4s
-# PROFITABLE: [[R1:%[0-9]+]]:fpr128 = FNEGv4f32 %2
+# PROFITABLE: [[R1:%[0-9]+]]:fpr128 = contract FNEGv4f32 %2
# PROFITABLE-NEXT: FMLAv4f32 killed [[R1]], %0, %1, implicit $fpcr
---
name: f1_2d
@@ -68,18 +68,18 @@ body: |
%2:fpr128 = COPY $q2
%1:fpr128 = COPY $q1
%0:fpr128 = COPY $q0
- %3:fpr128 = FMULv2f64 %0, %1, implicit $fpcr
- %4:fpr128 = FSUBv2f64 killed %3, %2, implicit $fpcr
+ %3:fpr128 = contract FMULv2f64 %0, %1, implicit $fpcr
+ %4:fpr128 = contract FSUBv2f64 killed %3, %2, implicit $fpcr
$q0 = COPY %4
RET_ReallyLR implicit $q0
...
# UNPROFITABLE-LABEL: name: f1_2d
-# UNPROFITABLE: %3:fpr128 = FMULv2f64 %0, %1, implicit $fpcr
+# UNPROFITABLE: %3:fpr128 = contract FMULv2f64 %0, %1, implicit $fpcr
# UNPROFITABLE-NEXT: FSUBv2f64 killed %3, %2, implicit $fpcr
#
# PROFITABLE-LABEL: name: f1_2d
-# PROFITABLE: [[R1:%[0-9]+]]:fpr128 = FNEGv2f64 %2
+# PROFITABLE: [[R1:%[0-9]+]]:fpr128 = contract FNEGv2f64 %2
# PROFITABLE-NEXT: FMLAv2f64 killed [[R1]], %0, %1, implicit $fpcr
---
name: f1_both_fmul_2s
@@ -97,15 +97,15 @@ body: |
%2:fpr64 = COPY $q2
%1:fpr64 = COPY $q1
%0:fpr64 = COPY $q0
- %4:fpr64 = FMULv2f32 %0, %1, implicit $fpcr
- %5:fpr64 = FMULv2f32 %2, %3, implicit $fpcr
- %6:fpr64 = FSUBv2f32 killed %4, %5, implicit $fpcr
+ %4:fpr64 = contract FMULv2f32 %0, %1, implicit $fpcr
+ %5:fpr64 = contract FMULv2f32 %2, %3, implicit $fpcr
+ %6:fpr64 = contract FSUBv2f32 killed %4, %5, implicit $fpcr
$q0 = COPY %6
RET_ReallyLR implicit $q0
...
# ALL-LABEL: name: f1_both_fmul_2s
-# ALL: %4:fpr64 = FMULv2f32 %0, %1, implicit $fpcr
+# ALL: %4:fpr64 = contract FMULv2f32 %0, %1, implicit $fpcr
# ALL-NEXT: FMLSv2f32 killed %4, %2, %3, implicit $fpcr
---
name: f1_both_fmul_4s
@@ -123,15 +123,15 @@ body: |
%2:fpr128 = COPY $q2
%1:fpr128 = COPY $q1
%0:fpr128 = COPY $q0
- %4:fpr128 = FMULv4f32 %0, %1, implicit $fpcr
- %5:fpr128 = FMULv4f32 %2, %3, implicit $fpcr
- %6:fpr128 = FSUBv4f32 killed %4, %5, implicit $fpcr
+ %4:fpr128 = contract FMULv4f32 %0, %1, implicit $fpcr
+ %5:fpr128 = contract FMULv4f32 %2, %3, implicit $fpcr
+ %6:fpr128 = contract FSUBv4f32 killed %4, %5, implicit $fpcr
$q0 = COPY %6
RET_ReallyLR implicit $q0
...
# ALL-LABEL: name: f1_both_fmul_4s
-# ALL: %4:fpr128 = FMULv4f32 %0, %1, implicit $fpcr
+# ALL: %4:fpr128 = contract FMULv4f32 %0, %1, implicit $fpcr
# ALL-NEXT: FMLSv4f32 killed %4, %2, %3, implicit $fpcr
---
name: f1_both_fmul_2d
@@ -149,14 +149,14 @@ body: |
%2:fpr128 = COPY $q2
%1:fpr128 = COPY $q1
%0:fpr128 = COPY $q0
- %4:fpr128 = FMULv2f64 %0, %1, implicit $fpcr
- %5:fpr128 = FMULv2f64 %2, %3, implicit $fpcr
- %6:fpr128 = FSUBv2f64 killed %4, %5, implicit $fpcr
+ %4:fpr128 = contract FMULv2f64 %0, %1, implicit $fpcr
+ %5:fpr128 = contract FMULv2f64 %2, %3, implicit $fpcr
+ %6:fpr128 = contract FSUBv2f64 killed %4, %5, implicit $fpcr
$q0 = COPY %6
RET_ReallyLR implicit $q0
...
# ALL-LABEL: name: f1_both_fmul_2d
-# ALL: %4:fpr128 = FMULv2f64 %0, %1, implicit $fpcr
+# ALL: %4:fpr128 = contract FMULv2f64 %0, %1, implicit $fpcr
# ALL-NEXT: FMLSv2f64 killed %4, %2, %3, implicit $fpcr
diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir
deleted file mode 100644
index 09eb18b..0000000
--- a/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir
+++ /dev/null
@@ -1,364 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -run-pass=machine-combiner -mcpu=neoverse-n2 -mtriple=aarch64-none-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s
-
----
-name: split_loads_to_fpr128
-body: |
- bb.0.entry:
- liveins: $x0, $x1, $x2, $x3, $x4
-
- ; CHECK-LABEL: name: split_loads_to_fpr128
- ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
- ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1
- ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub
- ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY2]]
- ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY3]], 0
- ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub
- ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY4]]
- ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]]
- ; CHECK-NEXT: $q0 = COPY [[ZIP]]
- ; CHECK-NEXT: RET_ReallyLR implicit $q0
- %0:gpr64common = COPY $x0
- %1:gpr64common = COPY $x1
- %2:gpr64common = COPY $x2
- %3:gpr64common = COPY $x3
- %4:gpr64common = COPY $x4
- %5:fpr32 = LDRSroX %0, killed %1, 0, 1
- %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub
- %7:fpr128 = LD1i32 %6, 1, killed %2
- %8:fpr128 = LD1i32 %7, 2, killed %3
- %9:fpr128 = LD1i32 %8, 3, killed %4
- $q0 = COPY %9
- RET_ReallyLR implicit $q0
-
----
-name: split_loads_to_fpr128_ui
-body: |
- bb.0.entry:
- liveins: $x0, $x1, $x2, $x3, $x4
-
- ; CHECK-LABEL: name: split_loads_to_fpr128_ui
- ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
- ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSui [[COPY]], 0
- ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub
- ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY1]]
- ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY2]], 0
- ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub
- ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY3]]
- ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]]
- ; CHECK-NEXT: $q0 = COPY [[ZIP]]
- ; CHECK-NEXT: RET_ReallyLR implicit $q0
- %0:gpr64common = COPY $x0
- %1:gpr64common = COPY $x1
- %2:gpr64common = COPY $x2
- %3:gpr64common = COPY $x3
- %4:gpr64common = COPY $x4
- %5:fpr32 = LDRSui %0, 0
- %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub
- %7:fpr128 = LD1i32 %6, 1, killed %1
- %8:fpr128 = LD1i32 %7, 2, killed %2
- %9:fpr128 = LD1i32 %8, 3, killed %3
- $q0 = COPY %9
- RET_ReallyLR implicit $q0
-
----
-name: split_loads_to_fpr128_i16
-body: |
- bb.0.entry:
- liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8
-
- ; CHECK-LABEL: name: split_loads_to_fpr128_i16
- ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8
- ; CHECK-NEXT: [[LD_i16:%[0-9]+]]:fpr16 = LDRHroX [[COPY]], killed [[COPY1]], 0, 1
- ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i16]], %subreg.hsub
- ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i16 [[FIRST_REG]], 1, killed [[COPY2]]
- ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i16 [[LD0_1]], 2, killed [[COPY3]]
- ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i16 [[LD0_2]], 3, killed [[COPY4]]
- ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr16 = LDRHui [[COPY5]], 0
- ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.hsub
- ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i16 [[SECOND_REG]], 1, killed [[COPY6]]
- ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i16 [[LD1_1]], 2, killed [[COPY7]]
- ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i16 [[LD1_2]], 3, killed [[COPY8]]
- ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_3]], [[LD1_3]]
- ; CHECK-NEXT: $q0 = COPY [[ZIP]]
- ; CHECK-NEXT: RET_ReallyLR implicit $q0
- %0:gpr64common = COPY $x0
- %1:gpr64common = COPY $x1
- %2:gpr64common = COPY $x2
- %3:gpr64common = COPY $x3
- %4:gpr64common = COPY $x4
- %5:gpr64common = COPY $x5
- %6:gpr64common = COPY $x6
- %7:gpr64common = COPY $x7
- %8:gpr64common = COPY $x8
- %9:fpr16 = LDRHroX %0, killed %1, 0, 1
- %10:fpr128 = SUBREG_TO_REG 0, killed %9, %subreg.hsub
- %11:fpr128 = LD1i16 %10, 1, killed %2
- %12:fpr128 = LD1i16 %11, 2, killed %3
- %13:fpr128 = LD1i16 %12, 3, killed %4
- %14:fpr128 = LD1i16 %13, 4, killed %5
- %15:fpr128 = LD1i16 %14, 5, killed %6
- %16:fpr128 = LD1i16 %15, 6, killed %7
- %17:fpr128 = LD1i16 %16, 7, killed %8
- $q0 = COPY %17
- RET_ReallyLR implicit $q0
-
----
-name: split_loads_to_fpr128_i16_ui
-body: |
- bb.0.entry:
- liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8
-
- ; CHECK-LABEL: name: split_loads_to_fpr128_i16_ui
- ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8
- ; CHECK-NEXT: [[LD_i16:%[0-9]+]]:fpr16 = LDRHui [[COPY]], 0
- ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i16]], %subreg.hsub
- ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i16 [[FIRST_REG]], 1, killed [[COPY1]]
- ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i16 [[LD0_1]], 2, killed [[COPY2]]
- ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i16 [[LD0_2]], 3, killed [[COPY3]]
- ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr16 = LDRHui [[COPY4]], 0
- ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.hsub
- ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i16 [[SECOND_REG]], 1, killed [[COPY5]]
- ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i16 [[LD1_1]], 2, killed [[COPY6]]
- ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i16 [[LD1_2]], 3, killed [[COPY7]]
- ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_3]], [[LD1_3]]
- ; CHECK-NEXT: $q0 = COPY [[ZIP]]
- ; CHECK-NEXT: RET_ReallyLR implicit $q0
- %0:gpr64common = COPY $x0
- %1:gpr64common = COPY $x1
- %2:gpr64common = COPY $x2
- %3:gpr64common = COPY $x3
- %4:gpr64common = COPY $x4
- %5:gpr64common = COPY $x5
- %6:gpr64common = COPY $x6
- %7:gpr64common = COPY $x7
- %8:gpr64common = COPY $x8
- %9:fpr16 = LDRHui %0, 0
- %10:fpr128 = SUBREG_TO_REG 0, killed %9, %subreg.hsub
- %11:fpr128 = LD1i16 %10, 1, killed %1
- %12:fpr128 = LD1i16 %11, 2, killed %2
- %13:fpr128 = LD1i16 %12, 3, killed %3
- %14:fpr128 = LD1i16 %13, 4, killed %4
- %15:fpr128 = LD1i16 %14, 5, killed %5
- %16:fpr128 = LD1i16 %15, 6, killed %6
- %17:fpr128 = LD1i16 %16, 7, killed %7
- $q0 = COPY %17
- RET_ReallyLR implicit $q0
-
----
-name: split_loads_to_fpr128_i8
-body: |
- bb.0.entry:
- liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16
-
- ; CHECK-LABEL: name: split_loads_to_fpr128_i8
- ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gpr64common = COPY $x9
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gpr64common = COPY $x10
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gpr64common = COPY $x11
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gpr64common = COPY $x12
- ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gpr64common = COPY $x13
- ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gpr64common = COPY $x14
- ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gpr64common = COPY $x15
- ; CHECK-NEXT: [[COPY16:%[0-9]+]]:gpr64common = COPY $x16
- ; CHECK-NEXT: [[LD_i8:%[0-9]+]]:fpr8 = LDRBroX [[COPY]], killed [[COPY1]], 0, 0
- ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i8]], %subreg.bsub
- ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i8 [[FIRST_REG]], 1, killed [[COPY2]]
- ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i8 [[LD0_1]], 2, killed [[COPY3]]
- ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i8 [[LD0_2]], 3, killed [[COPY4]]
- ; CHECK-NEXT: [[LD0_4:%[0-9]+]]:fpr128 = LD1i8 [[LD0_3]], 4, killed [[COPY5]]
- ; CHECK-NEXT: [[LD0_5:%[0-9]+]]:fpr128 = LD1i8 [[LD0_4]], 5, killed [[COPY6]]
- ; CHECK-NEXT: [[LD0_6:%[0-9]+]]:fpr128 = LD1i8 [[LD0_5]], 6, killed [[COPY7]]
- ; CHECK-NEXT: [[LD0_7:%[0-9]+]]:fpr128 = LD1i8 [[LD0_6]], 7, killed [[COPY8]]
- ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr8 = LDRBui [[COPY9]], 0
- ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.bsub
- ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i8 [[SECOND_REG]], 1, killed [[COPY10]]
- ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i8 [[LD1_1]], 2, killed [[COPY11]]
- ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i8 [[LD1_2]], 3, killed [[COPY12]]
- ; CHECK-NEXT: [[LD1_4:%[0-9]+]]:fpr128 = LD1i8 [[LD1_3]], 4, killed [[COPY13]]
- ; CHECK-NEXT: [[LD1_5:%[0-9]+]]:fpr128 = LD1i8 [[LD1_4]], 5, killed [[COPY14]]
- ; CHECK-NEXT: [[LD1_6:%[0-9]+]]:fpr128 = LD1i8 [[LD1_5]], 6, killed [[COPY15]]
- ; CHECK-NEXT: [[LD1_7:%[0-9]+]]:fpr128 = LD1i8 [[LD1_6]], 7, killed [[COPY16]]
- ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_7]], [[LD1_7]]
- ; CHECK-NEXT: $q0 = COPY [[ZIP]]
- ; CHECK-NEXT: RET_ReallyLR implicit $q0
- %0:gpr64common = COPY $x0
- %1:gpr64common = COPY $x1
- %2:gpr64common = COPY $x2
- %3:gpr64common = COPY $x3
- %4:gpr64common = COPY $x4
- %5:gpr64common = COPY $x5
- %6:gpr64common = COPY $x6
- %7:gpr64common = COPY $x7
- %8:gpr64common = COPY $x8
- %9:gpr64common = COPY $x9
- %10:gpr64common = COPY $x10
- %11:gpr64common = COPY $x11
- %12:gpr64common = COPY $x12
- %13:gpr64common = COPY $x13
- %14:gpr64common = COPY $x14
- %15:gpr64common = COPY $x15
- %16:gpr64common = COPY $x16
- %17:fpr8 = LDRBroX %0, killed %1, 0, 0
- %18:fpr128 = SUBREG_TO_REG 0, killed %17, %subreg.bsub
- %19:fpr128 = LD1i8 %18, 1, killed %2
- %20:fpr128 = LD1i8 %19, 2, killed %3
- %21:fpr128 = LD1i8 %20, 3, killed %4
- %22:fpr128 = LD1i8 %21, 4, killed %5
- %23:fpr128 = LD1i8 %22, 5, killed %6
- %24:fpr128 = LD1i8 %23, 6, killed %7
- %25:fpr128 = LD1i8 %24, 7, killed %8
- %26:fpr128 = LD1i8 %25, 8, killed %9
- %27:fpr128 = LD1i8 %26, 9, killed %10
- %28:fpr128 = LD1i8 %27, 10, killed %11
- %29:fpr128 = LD1i8 %28, 11, killed %12
- %30:fpr128 = LD1i8 %29, 12, killed %13
- %31:fpr128 = LD1i8 %30, 13, killed %14
- %32:fpr128 = LD1i8 %31, 14, killed %15
- %33:fpr128 = LD1i8 %32, 15, killed %16
- $q0 = COPY %33
- RET_ReallyLR implicit $q0
-
----
-name: negative_pattern_missing_lanes
-body: |
- bb.0.entry:
- liveins: $x0, $x1
-
- ; CHECK-LABEL: name: negative_pattern_missing_lanes
- ; CHECK: [[LD1:%.*]]:fpr128 = LDRQui $x1, 0
- ; CHECK-NEXT: [[LD2:%.*]]:fpr128 = LD1i32 [[LD1]]
-
- %0:gpr64common = COPY $x0
- %1:fpr128 = LDRQui $x1, 0
- %2:fpr128 = LD1i32 %1, 3, %0
- $q0 = COPY %2
- RET_ReallyLR implicit $q0
-
----
-name: out_of_order_lanes
-body: |
- bb.0.entry:
- liveins: $x0, $x1, $x2, $x3, $x4
-
- ; CHECK-LABEL: name: out_of_order_lanes
- ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
- ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1
- ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub
- ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY3]]
- ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY2]], 0
- ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub
- ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY4]]
- ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]]
- ; CHECK-NEXT: $q0 = COPY [[ZIP]]
- ; CHECK-NEXT: RET_ReallyLR implicit $q0
- %0:gpr64common = COPY $x0
- %1:gpr64common = COPY $x1
- %2:gpr64common = COPY $x2
- %3:gpr64common = COPY $x3
- %4:gpr64common = COPY $x4
- %5:fpr32 = LDRSroX %0, killed %1, 0, 1
- %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub
- %7:fpr128 = LD1i32 %6, 2, killed %2
- %8:fpr128 = LD1i32 %7, 1, killed %3
- %9:fpr128 = LD1i32 %8, 3, killed %4
- $q0 = COPY %9
- RET_ReallyLR implicit $q0
-
----
-name: negative_pattern_no_subreg_to_reg
-body: |
- bb.0.entry:
- liveins: $x0, $x1, $x2, $x3
-
- ; CHECK-LABEL: name: negative_pattern_no_subreg_to_reg
- ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
- ; CHECK-NEXT: [[INITIAL_VEC:%[0-9]+]]:fpr128 = LDRQui [[COPY]], 0
- ; CHECK-NEXT: [[LD_LANE_1:%[0-9]+]]:fpr128 = LD1i32 [[INITIAL_VEC]], 1, killed [[COPY1]]
- ; CHECK-NEXT: [[LD_LANE_2:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_1]], 2, killed [[COPY2]]
- ; CHECK-NEXT: [[LD_LANE_3:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_2]], 3, killed [[COPY3]]
- ; CHECK-NEXT: $q0 = COPY [[LD_LANE_3]]
- ; CHECK-NEXT: RET_ReallyLR implicit $q0
- %0:gpr64common = COPY $x0
- %1:gpr64common = COPY $x1
- %2:gpr64common = COPY $x2
- %3:gpr64common = COPY $x3
- %4:fpr128 = LDRQui %0, 0
- %5:fpr128 = LD1i32 %4, 1, killed %1
- %6:fpr128 = LD1i32 %5, 2, killed %2
- %7:fpr128 = LD1i32 %6, 3, killed %3
- $q0 = COPY %7
- RET_ReallyLR implicit $q0
-
----
-name: negative_pattern_multiple_users
-body: |
- bb.0.entry:
- liveins: $x0, $x1, $x2, $x3, $x4
-
- ; CHECK-LABEL: name: negative_pattern_multiple_users
- ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4
- ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1
- ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub
- ; CHECK-NEXT: [[LD_LANE_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY2]]
- ; CHECK-NEXT: [[LD_LANE_2:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_1]], 2, killed [[COPY3]]
- ; CHECK-NEXT: [[LD_LANE_3:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_2]], 3, killed [[COPY4]]
- ; CHECK-NEXT: $q0 = COPY [[LD_LANE_3]]
- ; CHECK-NEXT: $q1 = COPY [[LD_LANE_2]]
- ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
- %0:gpr64common = COPY $x0
- %1:gpr64common = COPY $x1
- %2:gpr64common = COPY $x2
- %3:gpr64common = COPY $x3
- %4:gpr64common = COPY $x4
- %5:fpr32 = LDRSroX %0, killed %1, 0, 1
- %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub
- %7:fpr128 = LD1i32 %6, 1, killed %2
- %8:fpr128 = LD1i32 %7, 2, killed %3
- %9:fpr128 = LD1i32 %8, 3, killed %4
- $q0 = COPY %9
- $q1 = COPY %8
- RET_ReallyLR implicit $q0, implicit $q1
diff --git a/llvm/test/CodeGen/AArch64/andcompare.ll b/llvm/test/CodeGen/AArch64/andcompare.ll
index cbacd17..0e15b94 100644
--- a/llvm/test/CodeGen/AArch64/andcompare.ll
+++ b/llvm/test/CodeGen/AArch64/andcompare.ll
@@ -1,23 +1,23 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-none-elf -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,SDISEL
-; RUN: llc -mtriple=aarch64-none-elf -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,GISEL
+; RUN: llc -mtriple=aarch64-none-elf -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-none-elf -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
define i32 @and_eq_eq(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_eq_eq:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, eq
-; SDISEL-NEXT: cset w0, eq
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_eq_eq:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, eq
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, eq
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_eq_eq:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, eq
+; CHECK-SD-NEXT: cset w0, eq
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_eq_eq:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, eq
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, eq
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp eq i32 %s0, %s1
%c1 = icmp eq i32 %s2, %s3
@@ -27,21 +27,21 @@ entry:
}
define i32 @and_eq_ne(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_eq_ne:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #4, eq
-; SDISEL-NEXT: cset w0, ne
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_eq_ne:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, eq
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ne
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_eq_ne:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #4, eq
+; CHECK-SD-NEXT: cset w0, ne
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_eq_ne:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, eq
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ne
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp eq i32 %s0, %s1
%c1 = icmp ne i32 %s2, %s3
@@ -51,21 +51,21 @@ entry:
}
define i32 @and_eq_ult(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_eq_ult:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #2, eq
-; SDISEL-NEXT: cset w0, lo
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_eq_ult:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, eq
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, lo
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_eq_ult:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #2, eq
+; CHECK-SD-NEXT: cset w0, lo
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_eq_ult:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, eq
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, lo
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp eq i32 %s0, %s1
%c1 = icmp ult i32 %s2, %s3
@@ -75,21 +75,21 @@ entry:
}
define i32 @and_eq_ule(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_eq_ule:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #2, eq
-; SDISEL-NEXT: cset w0, ls
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_eq_ule:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, eq
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ls
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_eq_ule:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #2, eq
+; CHECK-SD-NEXT: cset w0, ls
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_eq_ule:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, eq
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ls
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp eq i32 %s0, %s1
%c1 = icmp ule i32 %s2, %s3
@@ -99,21 +99,21 @@ entry:
}
define i32 @and_eq_ugt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_eq_ugt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, eq
-; SDISEL-NEXT: cset w0, hi
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_eq_ugt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, eq
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hi
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_eq_ugt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, eq
+; CHECK-SD-NEXT: cset w0, hi
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_eq_ugt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, eq
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hi
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp eq i32 %s0, %s1
%c1 = icmp ugt i32 %s2, %s3
@@ -123,21 +123,21 @@ entry:
}
define i32 @and_eq_uge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_eq_uge:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, eq
-; SDISEL-NEXT: cset w0, hs
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_eq_uge:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, eq
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hs
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_eq_uge:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, eq
+; CHECK-SD-NEXT: cset w0, hs
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_eq_uge:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, eq
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hs
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp eq i32 %s0, %s1
%c1 = icmp uge i32 %s2, %s3
@@ -147,21 +147,21 @@ entry:
}
define i32 @and_eq_slt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_eq_slt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, eq
-; SDISEL-NEXT: cset w0, lt
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_eq_slt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, eq
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, lt
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_eq_slt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, eq
+; CHECK-SD-NEXT: cset w0, lt
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_eq_slt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, eq
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, lt
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp eq i32 %s0, %s1
%c1 = icmp slt i32 %s2, %s3
@@ -171,21 +171,21 @@ entry:
}
define i32 @and_eq_sle(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_eq_sle:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, eq
-; SDISEL-NEXT: cset w0, le
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_eq_sle:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, eq
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, le
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_eq_sle:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, eq
+; CHECK-SD-NEXT: cset w0, le
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_eq_sle:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, eq
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, le
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp eq i32 %s0, %s1
%c1 = icmp sle i32 %s2, %s3
@@ -195,21 +195,21 @@ entry:
}
define i32 @and_eq_sgt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_eq_sgt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #4, eq
-; SDISEL-NEXT: cset w0, gt
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_eq_sgt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, eq
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, gt
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_eq_sgt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #4, eq
+; CHECK-SD-NEXT: cset w0, gt
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_eq_sgt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, eq
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, gt
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp eq i32 %s0, %s1
%c1 = icmp sgt i32 %s2, %s3
@@ -219,21 +219,21 @@ entry:
}
define i32 @and_eq_sge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_eq_sge:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #8, eq
-; SDISEL-NEXT: cset w0, ge
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_eq_sge:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, eq
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ge
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_eq_sge:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #8, eq
+; CHECK-SD-NEXT: cset w0, ge
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_eq_sge:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, eq
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ge
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp eq i32 %s0, %s1
%c1 = icmp sge i32 %s2, %s3
@@ -243,21 +243,21 @@ entry:
}
define i32 @and_ne_eq(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ne_eq:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, ne
-; SDISEL-NEXT: cset w0, eq
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ne_eq:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ne
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, eq
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ne_eq:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, ne
+; CHECK-SD-NEXT: cset w0, eq
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ne_eq:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ne
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, eq
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ne i32 %s0, %s1
%c1 = icmp eq i32 %s2, %s3
@@ -267,21 +267,21 @@ entry:
}
define i32 @and_ne_ne(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ne_ne:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #4, ne
-; SDISEL-NEXT: cset w0, ne
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ne_ne:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ne
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ne
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ne_ne:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #4, ne
+; CHECK-SD-NEXT: cset w0, ne
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ne_ne:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ne
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ne
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ne i32 %s0, %s1
%c1 = icmp ne i32 %s2, %s3
@@ -291,21 +291,21 @@ entry:
}
define i32 @and_ne_ult(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ne_ult:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #2, ne
-; SDISEL-NEXT: cset w0, lo
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ne_ult:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ne
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, lo
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ne_ult:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #2, ne
+; CHECK-SD-NEXT: cset w0, lo
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ne_ult:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ne
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, lo
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ne i32 %s0, %s1
%c1 = icmp ult i32 %s2, %s3
@@ -315,21 +315,21 @@ entry:
}
define i32 @and_ne_ule(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ne_ule:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #2, ne
-; SDISEL-NEXT: cset w0, ls
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ne_ule:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ne
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ls
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ne_ule:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #2, ne
+; CHECK-SD-NEXT: cset w0, ls
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ne_ule:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ne
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ls
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ne i32 %s0, %s1
%c1 = icmp ule i32 %s2, %s3
@@ -339,21 +339,21 @@ entry:
}
define i32 @and_ne_ugt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ne_ugt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, ne
-; SDISEL-NEXT: cset w0, hi
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ne_ugt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ne
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hi
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ne_ugt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, ne
+; CHECK-SD-NEXT: cset w0, hi
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ne_ugt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ne
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hi
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ne i32 %s0, %s1
%c1 = icmp ugt i32 %s2, %s3
@@ -363,21 +363,21 @@ entry:
}
define i32 @and_ne_uge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ne_uge:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, ne
-; SDISEL-NEXT: cset w0, hs
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ne_uge:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ne
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hs
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ne_uge:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, ne
+; CHECK-SD-NEXT: cset w0, hs
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ne_uge:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ne
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hs
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ne i32 %s0, %s1
%c1 = icmp uge i32 %s2, %s3
@@ -387,21 +387,21 @@ entry:
}
define i32 @and_ne_slt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ne_slt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, ne
-; SDISEL-NEXT: cset w0, lt
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ne_slt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ne
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, lt
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ne_slt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, ne
+; CHECK-SD-NEXT: cset w0, lt
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ne_slt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ne
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, lt
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ne i32 %s0, %s1
%c1 = icmp slt i32 %s2, %s3
@@ -411,21 +411,21 @@ entry:
}
define i32 @and_ne_sle(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ne_sle:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, ne
-; SDISEL-NEXT: cset w0, le
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ne_sle:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ne
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, le
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ne_sle:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, ne
+; CHECK-SD-NEXT: cset w0, le
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ne_sle:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ne
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, le
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ne i32 %s0, %s1
%c1 = icmp sle i32 %s2, %s3
@@ -435,21 +435,21 @@ entry:
}
define i32 @and_ne_sgt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ne_sgt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #4, ne
-; SDISEL-NEXT: cset w0, gt
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ne_sgt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ne
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, gt
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ne_sgt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #4, ne
+; CHECK-SD-NEXT: cset w0, gt
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ne_sgt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ne
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, gt
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ne i32 %s0, %s1
%c1 = icmp sgt i32 %s2, %s3
@@ -459,21 +459,21 @@ entry:
}
define i32 @and_ne_sge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ne_sge:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #8, ne
-; SDISEL-NEXT: cset w0, ge
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ne_sge:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ne
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ge
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ne_sge:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #8, ne
+; CHECK-SD-NEXT: cset w0, ge
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ne_sge:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ne
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ge
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ne i32 %s0, %s1
%c1 = icmp sge i32 %s2, %s3
@@ -483,21 +483,21 @@ entry:
}
define i32 @and_ult_eq(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ult_eq:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, lo
-; SDISEL-NEXT: cset w0, eq
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ult_eq:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lo
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, eq
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ult_eq:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, lo
+; CHECK-SD-NEXT: cset w0, eq
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ult_eq:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, eq
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ult i32 %s0, %s1
%c1 = icmp eq i32 %s2, %s3
@@ -507,21 +507,21 @@ entry:
}
define i32 @and_ult_ne(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ult_ne:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #4, lo
-; SDISEL-NEXT: cset w0, ne
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ult_ne:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lo
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ne
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ult_ne:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #4, lo
+; CHECK-SD-NEXT: cset w0, ne
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ult_ne:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ne
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ult i32 %s0, %s1
%c1 = icmp ne i32 %s2, %s3
@@ -531,21 +531,21 @@ entry:
}
define i32 @and_ult_ult(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ult_ult:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #2, lo
-; SDISEL-NEXT: cset w0, lo
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ult_ult:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lo
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, lo
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ult_ult:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #2, lo
+; CHECK-SD-NEXT: cset w0, lo
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ult_ult:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, lo
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ult i32 %s0, %s1
%c1 = icmp ult i32 %s2, %s3
@@ -555,21 +555,21 @@ entry:
}
define i32 @and_ult_ule(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ult_ule:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #2, lo
-; SDISEL-NEXT: cset w0, ls
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ult_ule:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lo
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ls
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ult_ule:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #2, lo
+; CHECK-SD-NEXT: cset w0, ls
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ult_ule:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ls
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ult i32 %s0, %s1
%c1 = icmp ule i32 %s2, %s3
@@ -579,21 +579,21 @@ entry:
}
define i32 @and_ult_ugt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ult_ugt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, lo
-; SDISEL-NEXT: cset w0, hi
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ult_ugt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lo
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hi
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ult_ugt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, lo
+; CHECK-SD-NEXT: cset w0, hi
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ult_ugt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hi
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ult i32 %s0, %s1
%c1 = icmp ugt i32 %s2, %s3
@@ -603,21 +603,21 @@ entry:
}
define i32 @and_ult_uge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ult_uge:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, lo
-; SDISEL-NEXT: cset w0, hs
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ult_uge:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lo
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hs
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ult_uge:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, lo
+; CHECK-SD-NEXT: cset w0, hs
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ult_uge:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hs
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ult i32 %s0, %s1
%c1 = icmp uge i32 %s2, %s3
@@ -627,21 +627,21 @@ entry:
}
define i32 @and_ult_slt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ult_slt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, lo
-; SDISEL-NEXT: cset w0, lt
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ult_slt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lo
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, lt
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ult_slt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, lo
+; CHECK-SD-NEXT: cset w0, lt
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ult_slt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, lt
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ult i32 %s0, %s1
%c1 = icmp slt i32 %s2, %s3
@@ -651,21 +651,21 @@ entry:
}
define i32 @and_ult_sle(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ult_sle:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, lo
-; SDISEL-NEXT: cset w0, le
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ult_sle:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lo
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, le
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ult_sle:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, lo
+; CHECK-SD-NEXT: cset w0, le
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ult_sle:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, le
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ult i32 %s0, %s1
%c1 = icmp sle i32 %s2, %s3
@@ -675,21 +675,21 @@ entry:
}
define i32 @and_ult_sgt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ult_sgt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #4, lo
-; SDISEL-NEXT: cset w0, gt
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ult_sgt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lo
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, gt
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ult_sgt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #4, lo
+; CHECK-SD-NEXT: cset w0, gt
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ult_sgt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, gt
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ult i32 %s0, %s1
%c1 = icmp sgt i32 %s2, %s3
@@ -699,21 +699,21 @@ entry:
}
define i32 @and_ult_sge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ult_sge:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #8, lo
-; SDISEL-NEXT: cset w0, ge
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ult_sge:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lo
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ge
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ult_sge:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #8, lo
+; CHECK-SD-NEXT: cset w0, ge
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ult_sge:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ge
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ult i32 %s0, %s1
%c1 = icmp sge i32 %s2, %s3
@@ -723,21 +723,21 @@ entry:
}
define i32 @and_ule_eq(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ule_eq:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, ls
-; SDISEL-NEXT: cset w0, eq
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ule_eq:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ls
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, eq
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ule_eq:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, ls
+; CHECK-SD-NEXT: cset w0, eq
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ule_eq:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ls
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, eq
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ule i32 %s0, %s1
%c1 = icmp eq i32 %s2, %s3
@@ -747,21 +747,21 @@ entry:
}
define i32 @and_ule_ne(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ule_ne:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #4, ls
-; SDISEL-NEXT: cset w0, ne
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ule_ne:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ls
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ne
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ule_ne:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #4, ls
+; CHECK-SD-NEXT: cset w0, ne
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ule_ne:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ls
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ne
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ule i32 %s0, %s1
%c1 = icmp ne i32 %s2, %s3
@@ -771,21 +771,21 @@ entry:
}
define i32 @and_ule_ult(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ule_ult:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #2, ls
-; SDISEL-NEXT: cset w0, lo
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ule_ult:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ls
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, lo
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ule_ult:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #2, ls
+; CHECK-SD-NEXT: cset w0, lo
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ule_ult:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ls
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, lo
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ule i32 %s0, %s1
%c1 = icmp ult i32 %s2, %s3
@@ -795,21 +795,21 @@ entry:
}
define i32 @and_ule_ule(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ule_ule:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #2, ls
-; SDISEL-NEXT: cset w0, ls
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ule_ule:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ls
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ls
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ule_ule:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #2, ls
+; CHECK-SD-NEXT: cset w0, ls
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ule_ule:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ls
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ls
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ule i32 %s0, %s1
%c1 = icmp ule i32 %s2, %s3
@@ -819,21 +819,21 @@ entry:
}
define i32 @and_ule_ugt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ule_ugt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, ls
-; SDISEL-NEXT: cset w0, hi
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ule_ugt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ls
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hi
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ule_ugt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, ls
+; CHECK-SD-NEXT: cset w0, hi
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ule_ugt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ls
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hi
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ule i32 %s0, %s1
%c1 = icmp ugt i32 %s2, %s3
@@ -843,21 +843,21 @@ entry:
}
define i32 @and_ule_uge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ule_uge:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, ls
-; SDISEL-NEXT: cset w0, hs
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ule_uge:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ls
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hs
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ule_uge:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, ls
+; CHECK-SD-NEXT: cset w0, hs
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ule_uge:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ls
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hs
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ule i32 %s0, %s1
%c1 = icmp uge i32 %s2, %s3
@@ -867,21 +867,21 @@ entry:
}
define i32 @and_ule_slt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ule_slt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, ls
-; SDISEL-NEXT: cset w0, lt
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ule_slt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ls
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, lt
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ule_slt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, ls
+; CHECK-SD-NEXT: cset w0, lt
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ule_slt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ls
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, lt
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ule i32 %s0, %s1
%c1 = icmp slt i32 %s2, %s3
@@ -891,21 +891,21 @@ entry:
}
define i32 @and_ule_sle(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ule_sle:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, ls
-; SDISEL-NEXT: cset w0, le
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ule_sle:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ls
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, le
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ule_sle:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, ls
+; CHECK-SD-NEXT: cset w0, le
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ule_sle:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ls
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, le
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ule i32 %s0, %s1
%c1 = icmp sle i32 %s2, %s3
@@ -915,21 +915,21 @@ entry:
}
define i32 @and_ule_sgt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ule_sgt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #4, ls
-; SDISEL-NEXT: cset w0, gt
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ule_sgt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ls
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, gt
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ule_sgt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #4, ls
+; CHECK-SD-NEXT: cset w0, gt
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ule_sgt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ls
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, gt
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ule i32 %s0, %s1
%c1 = icmp sgt i32 %s2, %s3
@@ -939,21 +939,21 @@ entry:
}
define i32 @and_ule_sge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ule_sge:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #8, ls
-; SDISEL-NEXT: cset w0, ge
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ule_sge:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ls
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ge
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ule_sge:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #8, ls
+; CHECK-SD-NEXT: cset w0, ge
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ule_sge:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ls
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ge
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ule i32 %s0, %s1
%c1 = icmp sge i32 %s2, %s3
@@ -963,21 +963,21 @@ entry:
}
define i32 @and_ugt_eq(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ugt_eq:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, hi
-; SDISEL-NEXT: cset w0, eq
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ugt_eq:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, hi
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, eq
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ugt_eq:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, hi
+; CHECK-SD-NEXT: cset w0, eq
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ugt_eq:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, hi
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, eq
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ugt i32 %s0, %s1
%c1 = icmp eq i32 %s2, %s3
@@ -987,21 +987,21 @@ entry:
}
define i32 @and_ugt_ne(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ugt_ne:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #4, hi
-; SDISEL-NEXT: cset w0, ne
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ugt_ne:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, hi
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ne
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ugt_ne:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #4, hi
+; CHECK-SD-NEXT: cset w0, ne
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ugt_ne:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, hi
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ne
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ugt i32 %s0, %s1
%c1 = icmp ne i32 %s2, %s3
@@ -1011,21 +1011,21 @@ entry:
}
define i32 @and_ugt_ult(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ugt_ult:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #2, hi
-; SDISEL-NEXT: cset w0, lo
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ugt_ult:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, hi
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, lo
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ugt_ult:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #2, hi
+; CHECK-SD-NEXT: cset w0, lo
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ugt_ult:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, hi
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, lo
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ugt i32 %s0, %s1
%c1 = icmp ult i32 %s2, %s3
@@ -1035,21 +1035,21 @@ entry:
}
define i32 @and_ugt_ule(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ugt_ule:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #2, hi
-; SDISEL-NEXT: cset w0, ls
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ugt_ule:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, hi
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ls
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ugt_ule:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #2, hi
+; CHECK-SD-NEXT: cset w0, ls
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ugt_ule:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, hi
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ls
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ugt i32 %s0, %s1
%c1 = icmp ule i32 %s2, %s3
@@ -1059,21 +1059,21 @@ entry:
}
define i32 @and_ugt_ugt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ugt_ugt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, hi
-; SDISEL-NEXT: cset w0, hi
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ugt_ugt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, hi
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hi
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ugt_ugt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, hi
+; CHECK-SD-NEXT: cset w0, hi
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ugt_ugt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, hi
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hi
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ugt i32 %s0, %s1
%c1 = icmp ugt i32 %s2, %s3
@@ -1083,21 +1083,21 @@ entry:
}
define i32 @and_ugt_uge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ugt_uge:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, hi
-; SDISEL-NEXT: cset w0, hs
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ugt_uge:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, hi
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hs
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ugt_uge:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, hi
+; CHECK-SD-NEXT: cset w0, hs
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ugt_uge:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, hi
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hs
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ugt i32 %s0, %s1
%c1 = icmp uge i32 %s2, %s3
@@ -1107,21 +1107,21 @@ entry:
}
define i32 @and_ugt_slt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ugt_slt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, hi
-; SDISEL-NEXT: cset w0, lt
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ugt_slt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, hi
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, lt
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ugt_slt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, hi
+; CHECK-SD-NEXT: cset w0, lt
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ugt_slt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, hi
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, lt
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ugt i32 %s0, %s1
%c1 = icmp slt i32 %s2, %s3
@@ -1131,21 +1131,21 @@ entry:
}
define i32 @and_ugt_sle(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ugt_sle:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, hi
-; SDISEL-NEXT: cset w0, le
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ugt_sle:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, hi
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, le
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ugt_sle:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, hi
+; CHECK-SD-NEXT: cset w0, le
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ugt_sle:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, hi
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, le
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ugt i32 %s0, %s1
%c1 = icmp sle i32 %s2, %s3
@@ -1155,21 +1155,21 @@ entry:
}
define i32 @and_ugt_sgt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ugt_sgt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #4, hi
-; SDISEL-NEXT: cset w0, gt
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ugt_sgt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, hi
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, gt
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ugt_sgt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #4, hi
+; CHECK-SD-NEXT: cset w0, gt
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ugt_sgt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, hi
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, gt
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ugt i32 %s0, %s1
%c1 = icmp sgt i32 %s2, %s3
@@ -1179,21 +1179,21 @@ entry:
}
define i32 @and_ugt_sge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_ugt_sge:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #8, hi
-; SDISEL-NEXT: cset w0, ge
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_ugt_sge:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, hi
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ge
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ugt_sge:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #8, hi
+; CHECK-SD-NEXT: cset w0, ge
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_ugt_sge:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, hi
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ge
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ugt i32 %s0, %s1
%c1 = icmp sge i32 %s2, %s3
@@ -1203,21 +1203,21 @@ entry:
}
define i32 @and_uge_eq(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_uge_eq:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, hs
-; SDISEL-NEXT: cset w0, eq
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_uge_eq:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, hs
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, eq
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_uge_eq:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, hs
+; CHECK-SD-NEXT: cset w0, eq
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_uge_eq:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, hs
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, eq
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp uge i32 %s0, %s1
%c1 = icmp eq i32 %s2, %s3
@@ -1227,21 +1227,21 @@ entry:
}
define i32 @and_uge_ne(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_uge_ne:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #4, hs
-; SDISEL-NEXT: cset w0, ne
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_uge_ne:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, hs
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ne
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_uge_ne:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #4, hs
+; CHECK-SD-NEXT: cset w0, ne
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_uge_ne:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, hs
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ne
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp uge i32 %s0, %s1
%c1 = icmp ne i32 %s2, %s3
@@ -1251,21 +1251,21 @@ entry:
}
define i32 @and_uge_ult(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_uge_ult:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #2, hs
-; SDISEL-NEXT: cset w0, lo
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_uge_ult:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, hs
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, lo
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_uge_ult:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #2, hs
+; CHECK-SD-NEXT: cset w0, lo
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_uge_ult:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, hs
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, lo
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp uge i32 %s0, %s1
%c1 = icmp ult i32 %s2, %s3
@@ -1275,21 +1275,21 @@ entry:
}
define i32 @and_uge_ule(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_uge_ule:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #2, hs
-; SDISEL-NEXT: cset w0, ls
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_uge_ule:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, hs
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ls
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_uge_ule:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #2, hs
+; CHECK-SD-NEXT: cset w0, ls
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_uge_ule:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, hs
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ls
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp uge i32 %s0, %s1
%c1 = icmp ule i32 %s2, %s3
@@ -1299,21 +1299,21 @@ entry:
}
define i32 @and_uge_ugt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_uge_ugt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, hs
-; SDISEL-NEXT: cset w0, hi
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_uge_ugt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, hs
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hi
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_uge_ugt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, hs
+; CHECK-SD-NEXT: cset w0, hi
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_uge_ugt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, hs
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hi
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp uge i32 %s0, %s1
%c1 = icmp ugt i32 %s2, %s3
@@ -1323,21 +1323,21 @@ entry:
}
define i32 @and_uge_uge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_uge_uge:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, hs
-; SDISEL-NEXT: cset w0, hs
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_uge_uge:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, hs
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hs
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_uge_uge:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, hs
+; CHECK-SD-NEXT: cset w0, hs
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_uge_uge:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, hs
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hs
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp uge i32 %s0, %s1
%c1 = icmp uge i32 %s2, %s3
@@ -1347,21 +1347,21 @@ entry:
}
define i32 @and_uge_slt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_uge_slt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, hs
-; SDISEL-NEXT: cset w0, lt
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_uge_slt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, hs
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, lt
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_uge_slt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, hs
+; CHECK-SD-NEXT: cset w0, lt
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_uge_slt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, hs
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, lt
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp uge i32 %s0, %s1
%c1 = icmp slt i32 %s2, %s3
@@ -1371,21 +1371,21 @@ entry:
}
define i32 @and_uge_sle(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_uge_sle:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, hs
-; SDISEL-NEXT: cset w0, le
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_uge_sle:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, hs
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, le
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_uge_sle:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, hs
+; CHECK-SD-NEXT: cset w0, le
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_uge_sle:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, hs
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, le
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp uge i32 %s0, %s1
%c1 = icmp sle i32 %s2, %s3
@@ -1395,21 +1395,21 @@ entry:
}
define i32 @and_uge_sgt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_uge_sgt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #4, hs
-; SDISEL-NEXT: cset w0, gt
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_uge_sgt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, hs
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, gt
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_uge_sgt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #4, hs
+; CHECK-SD-NEXT: cset w0, gt
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_uge_sgt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, hs
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, gt
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp uge i32 %s0, %s1
%c1 = icmp sgt i32 %s2, %s3
@@ -1419,21 +1419,21 @@ entry:
}
define i32 @and_uge_sge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_uge_sge:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #8, hs
-; SDISEL-NEXT: cset w0, ge
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_uge_sge:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, hs
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ge
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_uge_sge:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #8, hs
+; CHECK-SD-NEXT: cset w0, ge
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_uge_sge:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, hs
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ge
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp uge i32 %s0, %s1
%c1 = icmp sge i32 %s2, %s3
@@ -1443,21 +1443,21 @@ entry:
}
define i32 @and_slt_eq(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_slt_eq:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, lt
-; SDISEL-NEXT: cset w0, eq
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_slt_eq:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lt
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, eq
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_slt_eq:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, lt
+; CHECK-SD-NEXT: cset w0, eq
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_slt_eq:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lt
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, eq
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp slt i32 %s0, %s1
%c1 = icmp eq i32 %s2, %s3
@@ -1467,21 +1467,21 @@ entry:
}
define i32 @and_slt_ne(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_slt_ne:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #4, lt
-; SDISEL-NEXT: cset w0, ne
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_slt_ne:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lt
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ne
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_slt_ne:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #4, lt
+; CHECK-SD-NEXT: cset w0, ne
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_slt_ne:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lt
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ne
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp slt i32 %s0, %s1
%c1 = icmp ne i32 %s2, %s3
@@ -1491,21 +1491,21 @@ entry:
}
define i32 @and_slt_ult(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_slt_ult:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #2, lt
-; SDISEL-NEXT: cset w0, lo
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_slt_ult:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lt
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, lo
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_slt_ult:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #2, lt
+; CHECK-SD-NEXT: cset w0, lo
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_slt_ult:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lt
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, lo
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp slt i32 %s0, %s1
%c1 = icmp ult i32 %s2, %s3
@@ -1515,21 +1515,21 @@ entry:
}
define i32 @and_slt_ule(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_slt_ule:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #2, lt
-; SDISEL-NEXT: cset w0, ls
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_slt_ule:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lt
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ls
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_slt_ule:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #2, lt
+; CHECK-SD-NEXT: cset w0, ls
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_slt_ule:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lt
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ls
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp slt i32 %s0, %s1
%c1 = icmp ule i32 %s2, %s3
@@ -1539,21 +1539,21 @@ entry:
}
define i32 @and_slt_ugt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_slt_ugt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, lt
-; SDISEL-NEXT: cset w0, hi
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_slt_ugt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lt
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hi
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_slt_ugt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, lt
+; CHECK-SD-NEXT: cset w0, hi
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_slt_ugt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lt
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hi
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp slt i32 %s0, %s1
%c1 = icmp ugt i32 %s2, %s3
@@ -1563,21 +1563,21 @@ entry:
}
define i32 @and_slt_uge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_slt_uge:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, lt
-; SDISEL-NEXT: cset w0, hs
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_slt_uge:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lt
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hs
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_slt_uge:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, lt
+; CHECK-SD-NEXT: cset w0, hs
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_slt_uge:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lt
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hs
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp slt i32 %s0, %s1
%c1 = icmp uge i32 %s2, %s3
@@ -1587,21 +1587,21 @@ entry:
}
define i32 @and_slt_slt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_slt_slt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, lt
-; SDISEL-NEXT: cset w0, lt
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_slt_slt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lt
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, lt
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_slt_slt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, lt
+; CHECK-SD-NEXT: cset w0, lt
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_slt_slt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lt
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, lt
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp slt i32 %s0, %s1
%c1 = icmp slt i32 %s2, %s3
@@ -1611,21 +1611,21 @@ entry:
}
define i32 @and_slt_sle(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_slt_sle:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, lt
-; SDISEL-NEXT: cset w0, le
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_slt_sle:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lt
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, le
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_slt_sle:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, lt
+; CHECK-SD-NEXT: cset w0, le
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_slt_sle:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lt
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, le
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp slt i32 %s0, %s1
%c1 = icmp sle i32 %s2, %s3
@@ -1635,21 +1635,21 @@ entry:
}
define i32 @and_slt_sgt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_slt_sgt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #4, lt
-; SDISEL-NEXT: cset w0, gt
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_slt_sgt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lt
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, gt
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_slt_sgt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #4, lt
+; CHECK-SD-NEXT: cset w0, gt
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_slt_sgt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lt
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, gt
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp slt i32 %s0, %s1
%c1 = icmp sgt i32 %s2, %s3
@@ -1659,21 +1659,21 @@ entry:
}
define i32 @and_slt_sge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_slt_sge:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #8, lt
-; SDISEL-NEXT: cset w0, ge
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_slt_sge:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lt
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ge
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_slt_sge:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #8, lt
+; CHECK-SD-NEXT: cset w0, ge
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_slt_sge:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lt
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ge
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp slt i32 %s0, %s1
%c1 = icmp sge i32 %s2, %s3
@@ -1683,21 +1683,21 @@ entry:
}
define i32 @and_sle_eq(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sle_eq:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, le
-; SDISEL-NEXT: cset w0, eq
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sle_eq:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, le
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, eq
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sle_eq:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, le
+; CHECK-SD-NEXT: cset w0, eq
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sle_eq:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, le
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, eq
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sle i32 %s0, %s1
%c1 = icmp eq i32 %s2, %s3
@@ -1707,21 +1707,21 @@ entry:
}
define i32 @and_sle_ne(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sle_ne:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #4, le
-; SDISEL-NEXT: cset w0, ne
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sle_ne:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, le
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ne
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sle_ne:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #4, le
+; CHECK-SD-NEXT: cset w0, ne
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sle_ne:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, le
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ne
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sle i32 %s0, %s1
%c1 = icmp ne i32 %s2, %s3
@@ -1731,21 +1731,21 @@ entry:
}
define i32 @and_sle_ult(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sle_ult:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #2, le
-; SDISEL-NEXT: cset w0, lo
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sle_ult:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, le
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, lo
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sle_ult:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #2, le
+; CHECK-SD-NEXT: cset w0, lo
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sle_ult:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, le
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, lo
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sle i32 %s0, %s1
%c1 = icmp ult i32 %s2, %s3
@@ -1755,21 +1755,21 @@ entry:
}
define i32 @and_sle_ule(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sle_ule:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #2, le
-; SDISEL-NEXT: cset w0, ls
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sle_ule:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, le
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ls
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sle_ule:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #2, le
+; CHECK-SD-NEXT: cset w0, ls
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sle_ule:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, le
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ls
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sle i32 %s0, %s1
%c1 = icmp ule i32 %s2, %s3
@@ -1779,21 +1779,21 @@ entry:
}
define i32 @and_sle_ugt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sle_ugt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, le
-; SDISEL-NEXT: cset w0, hi
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sle_ugt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, le
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hi
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sle_ugt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, le
+; CHECK-SD-NEXT: cset w0, hi
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sle_ugt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, le
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hi
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sle i32 %s0, %s1
%c1 = icmp ugt i32 %s2, %s3
@@ -1803,21 +1803,21 @@ entry:
}
define i32 @and_sle_uge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sle_uge:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, le
-; SDISEL-NEXT: cset w0, hs
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sle_uge:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, le
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hs
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sle_uge:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, le
+; CHECK-SD-NEXT: cset w0, hs
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sle_uge:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, le
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hs
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sle i32 %s0, %s1
%c1 = icmp uge i32 %s2, %s3
@@ -1827,21 +1827,21 @@ entry:
}
define i32 @and_sle_slt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sle_slt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, le
-; SDISEL-NEXT: cset w0, lt
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sle_slt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, le
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, lt
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sle_slt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, le
+; CHECK-SD-NEXT: cset w0, lt
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sle_slt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, le
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, lt
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sle i32 %s0, %s1
%c1 = icmp slt i32 %s2, %s3
@@ -1851,21 +1851,21 @@ entry:
}
define i32 @and_sle_sle(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sle_sle:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, le
-; SDISEL-NEXT: cset w0, le
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sle_sle:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, le
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, le
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sle_sle:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, le
+; CHECK-SD-NEXT: cset w0, le
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sle_sle:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, le
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, le
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sle i32 %s0, %s1
%c1 = icmp sle i32 %s2, %s3
@@ -1875,21 +1875,21 @@ entry:
}
define i32 @and_sle_sgt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sle_sgt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #4, le
-; SDISEL-NEXT: cset w0, gt
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sle_sgt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, le
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, gt
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sle_sgt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #4, le
+; CHECK-SD-NEXT: cset w0, gt
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sle_sgt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, le
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, gt
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sle i32 %s0, %s1
%c1 = icmp sgt i32 %s2, %s3
@@ -1899,21 +1899,21 @@ entry:
}
define i32 @and_sle_sge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sle_sge:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #8, le
-; SDISEL-NEXT: cset w0, ge
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sle_sge:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, le
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ge
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sle_sge:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #8, le
+; CHECK-SD-NEXT: cset w0, ge
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sle_sge:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, le
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ge
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sle i32 %s0, %s1
%c1 = icmp sge i32 %s2, %s3
@@ -1923,21 +1923,21 @@ entry:
}
define i32 @and_sgt_eq(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sgt_eq:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, gt
-; SDISEL-NEXT: cset w0, eq
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sgt_eq:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, gt
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, eq
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sgt_eq:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, gt
+; CHECK-SD-NEXT: cset w0, eq
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sgt_eq:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, gt
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, eq
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sgt i32 %s0, %s1
%c1 = icmp eq i32 %s2, %s3
@@ -1947,21 +1947,21 @@ entry:
}
define i32 @and_sgt_ne(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sgt_ne:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #4, gt
-; SDISEL-NEXT: cset w0, ne
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sgt_ne:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, gt
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ne
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sgt_ne:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #4, gt
+; CHECK-SD-NEXT: cset w0, ne
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sgt_ne:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, gt
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ne
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sgt i32 %s0, %s1
%c1 = icmp ne i32 %s2, %s3
@@ -1971,21 +1971,21 @@ entry:
}
define i32 @and_sgt_ult(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sgt_ult:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #2, gt
-; SDISEL-NEXT: cset w0, lo
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sgt_ult:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, gt
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, lo
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sgt_ult:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #2, gt
+; CHECK-SD-NEXT: cset w0, lo
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sgt_ult:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, gt
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, lo
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sgt i32 %s0, %s1
%c1 = icmp ult i32 %s2, %s3
@@ -1995,21 +1995,21 @@ entry:
}
define i32 @and_sgt_ule(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sgt_ule:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #2, gt
-; SDISEL-NEXT: cset w0, ls
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sgt_ule:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, gt
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ls
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sgt_ule:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #2, gt
+; CHECK-SD-NEXT: cset w0, ls
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sgt_ule:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, gt
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ls
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sgt i32 %s0, %s1
%c1 = icmp ule i32 %s2, %s3
@@ -2019,21 +2019,21 @@ entry:
}
define i32 @and_sgt_ugt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sgt_ugt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, gt
-; SDISEL-NEXT: cset w0, hi
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sgt_ugt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, gt
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hi
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sgt_ugt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, gt
+; CHECK-SD-NEXT: cset w0, hi
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sgt_ugt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, gt
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hi
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sgt i32 %s0, %s1
%c1 = icmp ugt i32 %s2, %s3
@@ -2043,21 +2043,21 @@ entry:
}
define i32 @and_sgt_uge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sgt_uge:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, gt
-; SDISEL-NEXT: cset w0, hs
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sgt_uge:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, gt
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hs
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sgt_uge:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, gt
+; CHECK-SD-NEXT: cset w0, hs
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sgt_uge:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, gt
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hs
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sgt i32 %s0, %s1
%c1 = icmp uge i32 %s2, %s3
@@ -2067,21 +2067,21 @@ entry:
}
define i32 @and_sgt_slt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sgt_slt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, gt
-; SDISEL-NEXT: cset w0, lt
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sgt_slt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, gt
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, lt
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sgt_slt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, gt
+; CHECK-SD-NEXT: cset w0, lt
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sgt_slt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, gt
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, lt
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sgt i32 %s0, %s1
%c1 = icmp slt i32 %s2, %s3
@@ -2091,21 +2091,21 @@ entry:
}
define i32 @and_sgt_sle(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sgt_sle:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, gt
-; SDISEL-NEXT: cset w0, le
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sgt_sle:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, gt
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, le
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sgt_sle:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, gt
+; CHECK-SD-NEXT: cset w0, le
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sgt_sle:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, gt
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, le
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sgt i32 %s0, %s1
%c1 = icmp sle i32 %s2, %s3
@@ -2115,21 +2115,21 @@ entry:
}
define i32 @and_sgt_sgt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sgt_sgt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #4, gt
-; SDISEL-NEXT: cset w0, gt
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sgt_sgt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, gt
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, gt
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sgt_sgt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #4, gt
+; CHECK-SD-NEXT: cset w0, gt
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sgt_sgt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, gt
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, gt
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sgt i32 %s0, %s1
%c1 = icmp sgt i32 %s2, %s3
@@ -2139,21 +2139,21 @@ entry:
}
define i32 @and_sgt_sge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sgt_sge:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #8, gt
-; SDISEL-NEXT: cset w0, ge
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sgt_sge:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, gt
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ge
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sgt_sge:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #8, gt
+; CHECK-SD-NEXT: cset w0, ge
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sgt_sge:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, gt
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ge
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sgt i32 %s0, %s1
%c1 = icmp sge i32 %s2, %s3
@@ -2163,21 +2163,21 @@ entry:
}
define i32 @and_sge_eq(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sge_eq:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, ge
-; SDISEL-NEXT: cset w0, eq
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sge_eq:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ge
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, eq
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sge_eq:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, ge
+; CHECK-SD-NEXT: cset w0, eq
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sge_eq:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ge
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, eq
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sge i32 %s0, %s1
%c1 = icmp eq i32 %s2, %s3
@@ -2187,21 +2187,21 @@ entry:
}
define i32 @and_sge_ne(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sge_ne:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #4, ge
-; SDISEL-NEXT: cset w0, ne
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sge_ne:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ge
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ne
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sge_ne:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #4, ge
+; CHECK-SD-NEXT: cset w0, ne
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sge_ne:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ge
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ne
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sge i32 %s0, %s1
%c1 = icmp ne i32 %s2, %s3
@@ -2211,21 +2211,21 @@ entry:
}
define i32 @and_sge_ult(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sge_ult:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #2, ge
-; SDISEL-NEXT: cset w0, lo
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sge_ult:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ge
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, lo
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sge_ult:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #2, ge
+; CHECK-SD-NEXT: cset w0, lo
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sge_ult:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ge
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, lo
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sge i32 %s0, %s1
%c1 = icmp ult i32 %s2, %s3
@@ -2235,21 +2235,21 @@ entry:
}
define i32 @and_sge_ule(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sge_ule:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #2, ge
-; SDISEL-NEXT: cset w0, ls
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sge_ule:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ge
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ls
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sge_ule:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #2, ge
+; CHECK-SD-NEXT: cset w0, ls
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sge_ule:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ge
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ls
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sge i32 %s0, %s1
%c1 = icmp ule i32 %s2, %s3
@@ -2259,21 +2259,21 @@ entry:
}
define i32 @and_sge_ugt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sge_ugt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, ge
-; SDISEL-NEXT: cset w0, hi
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sge_ugt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ge
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hi
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sge_ugt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, ge
+; CHECK-SD-NEXT: cset w0, hi
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sge_ugt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ge
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hi
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sge i32 %s0, %s1
%c1 = icmp ugt i32 %s2, %s3
@@ -2283,21 +2283,21 @@ entry:
}
define i32 @and_sge_uge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sge_uge:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, ge
-; SDISEL-NEXT: cset w0, hs
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sge_uge:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ge
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hs
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sge_uge:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, ge
+; CHECK-SD-NEXT: cset w0, hs
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sge_uge:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ge
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hs
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sge i32 %s0, %s1
%c1 = icmp uge i32 %s2, %s3
@@ -2307,21 +2307,21 @@ entry:
}
define i32 @and_sge_slt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sge_slt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, ge
-; SDISEL-NEXT: cset w0, lt
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sge_slt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ge
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, lt
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sge_slt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, ge
+; CHECK-SD-NEXT: cset w0, lt
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sge_slt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ge
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, lt
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sge i32 %s0, %s1
%c1 = icmp slt i32 %s2, %s3
@@ -2331,21 +2331,21 @@ entry:
}
define i32 @and_sge_sle(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sge_sle:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, ge
-; SDISEL-NEXT: cset w0, le
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sge_sle:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ge
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, le
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sge_sle:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, ge
+; CHECK-SD-NEXT: cset w0, le
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sge_sle:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ge
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, le
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sge i32 %s0, %s1
%c1 = icmp sle i32 %s2, %s3
@@ -2355,21 +2355,21 @@ entry:
}
define i32 @and_sge_sgt(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sge_sgt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #4, ge
-; SDISEL-NEXT: cset w0, gt
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sge_sgt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ge
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, gt
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sge_sgt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #4, ge
+; CHECK-SD-NEXT: cset w0, gt
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sge_sgt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ge
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, gt
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sge i32 %s0, %s1
%c1 = icmp sgt i32 %s2, %s3
@@ -2379,21 +2379,21 @@ entry:
}
define i32 @and_sge_sge(i32 %s0, i32 %s1, i32 %s2, i32 %s3) {
-; SDISEL-LABEL: and_sge_sge:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #8, ge
-; SDISEL-NEXT: cset w0, ge
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_sge_sge:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ge
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ge
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sge_sge:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #8, ge
+; CHECK-SD-NEXT: cset w0, ge
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_sge_sge:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ge
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ge
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sge i32 %s0, %s1
%c1 = icmp sge i32 %s2, %s3
@@ -2403,19 +2403,19 @@ entry:
}
define i32 @cmp_to_ands1(i32 %num) {
-; SDISEL-LABEL: cmp_to_ands1:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: and w8, w0, #0xff
-; SDISEL-NEXT: tst w0, #0xfe
-; SDISEL-NEXT: csel w0, w8, wzr, ne
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: cmp_to_ands1:
-; GISEL: // %bb.0:
-; GISEL-NEXT: and w8, w0, #0xff
-; GISEL-NEXT: cmp w8, #1
-; GISEL-NEXT: csel w0, w8, wzr, hi
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: cmp_to_ands1:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: and w8, w0, #0xff
+; CHECK-SD-NEXT: tst w0, #0xfe
+; CHECK-SD-NEXT: csel w0, w8, wzr, ne
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: cmp_to_ands1:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: and w8, w0, #0xff
+; CHECK-GI-NEXT: cmp w8, #1
+; CHECK-GI-NEXT: csel w0, w8, wzr, hi
+; CHECK-GI-NEXT: ret
%and = and i32 %num, 255
%cmp = icmp ugt i32 %and, 1
%r = select i1 %cmp, i32 %and, i32 0
@@ -2423,19 +2423,19 @@ define i32 @cmp_to_ands1(i32 %num) {
}
define i32 @cmp_to_ands2(i32 %num) {
-; SDISEL-LABEL: cmp_to_ands2:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: and w8, w0, #0xfe
-; SDISEL-NEXT: tst w0, #0xc0
-; SDISEL-NEXT: csel w0, w8, wzr, ne
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: cmp_to_ands2:
-; GISEL: // %bb.0:
-; GISEL-NEXT: and w8, w0, #0xfe
-; GISEL-NEXT: cmp w8, #63
-; GISEL-NEXT: csel w0, w8, wzr, hi
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: cmp_to_ands2:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: and w8, w0, #0xfe
+; CHECK-SD-NEXT: tst w0, #0xc0
+; CHECK-SD-NEXT: csel w0, w8, wzr, ne
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: cmp_to_ands2:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: and w8, w0, #0xfe
+; CHECK-GI-NEXT: cmp w8, #63
+; CHECK-GI-NEXT: csel w0, w8, wzr, hi
+; CHECK-GI-NEXT: ret
%and = and i32 %num, 254
%cmp = icmp ugt i32 %and, 63
%r = select i1 %cmp, i32 %and, i32 0
@@ -2443,19 +2443,19 @@ define i32 @cmp_to_ands2(i32 %num) {
}
define i32 @cmp_to_ands3(i32 %num, i32 %a) {
-; SDISEL-LABEL: cmp_to_ands3:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: tst w0, #0x10
-; SDISEL-NEXT: csel w0, w1, wzr, ne
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: cmp_to_ands3:
-; GISEL: // %bb.0:
-; GISEL-NEXT: mov w8, #23 // =0x17
-; GISEL-NEXT: and w8, w0, w8
-; GISEL-NEXT: cmp w8, #7
-; GISEL-NEXT: csel w0, w1, wzr, hi
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: cmp_to_ands3:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: tst w0, #0x10
+; CHECK-SD-NEXT: csel w0, w1, wzr, ne
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: cmp_to_ands3:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov w8, #23 // =0x17
+; CHECK-GI-NEXT: and w8, w0, w8
+; CHECK-GI-NEXT: cmp w8, #7
+; CHECK-GI-NEXT: csel w0, w1, wzr, hi
+; CHECK-GI-NEXT: ret
%and = and i32 %num, 23
%cmp = icmp ugt i32 %and, 7
%r = select i1 %cmp, i32 %a, i32 0
@@ -2463,19 +2463,19 @@ define i32 @cmp_to_ands3(i32 %num, i32 %a) {
}
define i32 @cmp_to_ands4(i32 %num, i32 %a) {
-; SDISEL-LABEL: cmp_to_ands4:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: and w8, w0, #0x30
-; SDISEL-NEXT: tst w0, #0x20
-; SDISEL-NEXT: csel w0, w8, w1, eq
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: cmp_to_ands4:
-; GISEL: // %bb.0:
-; GISEL-NEXT: and w8, w0, #0x30
-; GISEL-NEXT: cmp w8, #31
-; GISEL-NEXT: csel w0, w8, w1, ls
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: cmp_to_ands4:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: and w8, w0, #0x30
+; CHECK-SD-NEXT: tst w0, #0x20
+; CHECK-SD-NEXT: csel w0, w8, w1, eq
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: cmp_to_ands4:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: and w8, w0, #0x30
+; CHECK-GI-NEXT: cmp w8, #31
+; CHECK-GI-NEXT: csel w0, w8, w1, ls
+; CHECK-GI-NEXT: ret
%and = and i32 %num, 48
%cmp = icmp ule i32 %and, 31
%r = select i1 %cmp, i32 %and, i32 %a
@@ -2483,19 +2483,19 @@ define i32 @cmp_to_ands4(i32 %num, i32 %a) {
}
define i32 @cmp_to_ands5(i32 %num, i32 %a) {
-; SDISEL-LABEL: cmp_to_ands5:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: and w8, w0, #0xf8
-; SDISEL-NEXT: tst w0, #0xc0
-; SDISEL-NEXT: csel w0, w8, w1, eq
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: cmp_to_ands5:
-; GISEL: // %bb.0:
-; GISEL-NEXT: and w8, w0, #0xf8
-; GISEL-NEXT: cmp w8, #64
-; GISEL-NEXT: csel w0, w8, w1, lo
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: cmp_to_ands5:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: and w8, w0, #0xf8
+; CHECK-SD-NEXT: tst w0, #0xc0
+; CHECK-SD-NEXT: csel w0, w8, w1, eq
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: cmp_to_ands5:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: and w8, w0, #0xf8
+; CHECK-GI-NEXT: cmp w8, #64
+; CHECK-GI-NEXT: csel w0, w8, w1, lo
+; CHECK-GI-NEXT: ret
%and = and i32 %num, 248
%cmp = icmp ult i32 %and, 64
%r = select i1 %cmp, i32 %and, i32 %a
@@ -2503,19 +2503,19 @@ define i32 @cmp_to_ands5(i32 %num, i32 %a) {
}
define i32 @cmp_to_ands6(i32 %num) {
-; SDISEL-LABEL: cmp_to_ands6:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: and w8, w0, #0xfe
-; SDISEL-NEXT: tst w0, #0xf0
-; SDISEL-NEXT: csel w0, w8, wzr, ne
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: cmp_to_ands6:
-; GISEL: // %bb.0:
-; GISEL-NEXT: and w8, w0, #0xfe
-; GISEL-NEXT: cmp w8, #16
-; GISEL-NEXT: csel w0, w8, wzr, hs
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: cmp_to_ands6:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: and w8, w0, #0xfe
+; CHECK-SD-NEXT: tst w0, #0xf0
+; CHECK-SD-NEXT: csel w0, w8, wzr, ne
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: cmp_to_ands6:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: and w8, w0, #0xfe
+; CHECK-GI-NEXT: cmp w8, #16
+; CHECK-GI-NEXT: csel w0, w8, wzr, hs
+; CHECK-GI-NEXT: ret
%and = and i32 %num, 254
%cmp = icmp uge i32 %and, 16
%r = select i1 %cmp, i32 %and, i32 0
@@ -2523,21 +2523,21 @@ define i32 @cmp_to_ands6(i32 %num) {
}
define i1 @and_fcmp(float %0, float %1) {
-; SDISEL-LABEL: and_fcmp:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: fcmp s1, s1
-; SDISEL-NEXT: fccmp s0, s0, #0, vs
-; SDISEL-NEXT: cset w0, vs
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: and_fcmp:
-; GISEL: // %bb.0:
-; GISEL-NEXT: fcmp s0, #0.0
-; GISEL-NEXT: cset w8, vs
-; GISEL-NEXT: fcmp s1, #0.0
-; GISEL-NEXT: cset w9, vs
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-SD-LABEL: and_fcmp:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fcmp s1, s1
+; CHECK-SD-NEXT: fccmp s0, s0, #0, vs
+; CHECK-SD-NEXT: cset w0, vs
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: and_fcmp:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fcmp s0, #0.0
+; CHECK-GI-NEXT: cset w8, vs
+; CHECK-GI-NEXT: fcmp s1, #0.0
+; CHECK-GI-NEXT: cset w9, vs
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
%3 = fcmp uno float %0, 0.000000e+00
%4 = fcmp uno float %1, 0.000000e+00
diff --git a/llvm/test/CodeGen/AArch64/andorbrcompare.ll b/llvm/test/CodeGen/AArch64/andorbrcompare.ll
index 951a5cd..5bc06ec 100644
--- a/llvm/test/CodeGen/AArch64/andorbrcompare.ll
+++ b/llvm/test/CodeGen/AArch64/andorbrcompare.ll
@@ -1,44 +1,44 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-none-elf -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,SDISEL
-; RUN: llc -mtriple=aarch64-none-elf -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,GISEL
+; RUN: llc -mtriple=aarch64-none-elf -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-none-elf -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
declare void @dummy()
define i32 @and_eq_ne_ult(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, ptr %p) {
-; SDISEL-LABEL: and_eq_ne_ult:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w2, w3
-; SDISEL-NEXT: ccmp w0, w1, #0, ne
-; SDISEL-NEXT: b.eq .LBB0_3
-; SDISEL-NEXT: // %bb.1: // %entry
-; SDISEL-NEXT: cmp w4, w5
-; SDISEL-NEXT: b.lo .LBB0_3
-; SDISEL-NEXT: // %bb.2:
-; SDISEL-NEXT: mov w0, wzr
-; SDISEL-NEXT: ret
-; SDISEL-NEXT: .LBB0_3: // %if
-; SDISEL-NEXT: mov w0, #1 // =0x1
-; SDISEL-NEXT: str w0, [x6]
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: and_eq_ne_ult:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w2, w3
+; CHECK-SD-NEXT: ccmp w0, w1, #0, ne
+; CHECK-SD-NEXT: b.eq .LBB0_3
+; CHECK-SD-NEXT: // %bb.1: // %entry
+; CHECK-SD-NEXT: cmp w4, w5
+; CHECK-SD-NEXT: b.lo .LBB0_3
+; CHECK-SD-NEXT: // %bb.2:
+; CHECK-SD-NEXT: mov w0, wzr
+; CHECK-SD-NEXT: ret
+; CHECK-SD-NEXT: .LBB0_3: // %if
+; CHECK-SD-NEXT: mov w0, #1 // =0x1
+; CHECK-SD-NEXT: str w0, [x6]
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: and_eq_ne_ult:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, eq
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ne
-; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: tbnz w8, #0, .LBB0_3
-; GISEL-NEXT: // %bb.1: // %entry
-; GISEL-NEXT: cmp w4, w5
-; GISEL-NEXT: mov w0, wzr
-; GISEL-NEXT: b.lo .LBB0_3
-; GISEL-NEXT: // %bb.2: // %common.ret
-; GISEL-NEXT: ret
-; GISEL-NEXT: .LBB0_3: // %if
-; GISEL-NEXT: mov w0, #1 // =0x1
-; GISEL-NEXT: str w0, [x6]
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: and_eq_ne_ult:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, eq
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ne
+; CHECK-GI-NEXT: and w8, w8, w9
+; CHECK-GI-NEXT: tbnz w8, #0, .LBB0_3
+; CHECK-GI-NEXT: // %bb.1: // %entry
+; CHECK-GI-NEXT: cmp w4, w5
+; CHECK-GI-NEXT: mov w0, wzr
+; CHECK-GI-NEXT: b.lo .LBB0_3
+; CHECK-GI-NEXT: // %bb.2: // %common.ret
+; CHECK-GI-NEXT: ret
+; CHECK-GI-NEXT: .LBB0_3: // %if
+; CHECK-GI-NEXT: mov w0, #1 // =0x1
+; CHECK-GI-NEXT: str w0, [x6]
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp eq i32 %s0, %s1
%c1 = icmp ne i32 %s2, %s3
@@ -56,40 +56,40 @@ else:
}
define i32 @and_ne_ult_ule(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, ptr %p) {
-; SDISEL-LABEL: and_ne_ult_ule:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w2, w3
-; SDISEL-NEXT: ccmp w0, w1, #4, lo
-; SDISEL-NEXT: b.ne .LBB1_3
-; SDISEL-NEXT: // %bb.1: // %entry
-; SDISEL-NEXT: cmp w4, w5
-; SDISEL-NEXT: b.ls .LBB1_3
-; SDISEL-NEXT: // %bb.2:
-; SDISEL-NEXT: mov w0, wzr
-; SDISEL-NEXT: ret
-; SDISEL-NEXT: .LBB1_3: // %if
-; SDISEL-NEXT: mov w0, #1 // =0x1
-; SDISEL-NEXT: str w0, [x6]
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ne_ult_ule:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w2, w3
+; CHECK-SD-NEXT: ccmp w0, w1, #4, lo
+; CHECK-SD-NEXT: b.ne .LBB1_3
+; CHECK-SD-NEXT: // %bb.1: // %entry
+; CHECK-SD-NEXT: cmp w4, w5
+; CHECK-SD-NEXT: b.ls .LBB1_3
+; CHECK-SD-NEXT: // %bb.2:
+; CHECK-SD-NEXT: mov w0, wzr
+; CHECK-SD-NEXT: ret
+; CHECK-SD-NEXT: .LBB1_3: // %if
+; CHECK-SD-NEXT: mov w0, #1 // =0x1
+; CHECK-SD-NEXT: str w0, [x6]
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: and_ne_ult_ule:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ne
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, lo
-; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: tbnz w8, #0, .LBB1_3
-; GISEL-NEXT: // %bb.1: // %entry
-; GISEL-NEXT: cmp w4, w5
-; GISEL-NEXT: mov w0, wzr
-; GISEL-NEXT: b.ls .LBB1_3
-; GISEL-NEXT: // %bb.2: // %common.ret
-; GISEL-NEXT: ret
-; GISEL-NEXT: .LBB1_3: // %if
-; GISEL-NEXT: mov w0, #1 // =0x1
-; GISEL-NEXT: str w0, [x6]
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: and_ne_ult_ule:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ne
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, lo
+; CHECK-GI-NEXT: and w8, w8, w9
+; CHECK-GI-NEXT: tbnz w8, #0, .LBB1_3
+; CHECK-GI-NEXT: // %bb.1: // %entry
+; CHECK-GI-NEXT: cmp w4, w5
+; CHECK-GI-NEXT: mov w0, wzr
+; CHECK-GI-NEXT: b.ls .LBB1_3
+; CHECK-GI-NEXT: // %bb.2: // %common.ret
+; CHECK-GI-NEXT: ret
+; CHECK-GI-NEXT: .LBB1_3: // %if
+; CHECK-GI-NEXT: mov w0, #1 // =0x1
+; CHECK-GI-NEXT: str w0, [x6]
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ne i32 %s0, %s1
%c1 = icmp ult i32 %s2, %s3
@@ -107,40 +107,40 @@ else:
}
define i32 @and_ult_ule_ugt(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, ptr %p) {
-; SDISEL-LABEL: and_ult_ule_ugt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w2, w3
-; SDISEL-NEXT: ccmp w0, w1, #2, ls
-; SDISEL-NEXT: b.lo .LBB2_3
-; SDISEL-NEXT: // %bb.1: // %entry
-; SDISEL-NEXT: cmp w4, w5
-; SDISEL-NEXT: b.hi .LBB2_3
-; SDISEL-NEXT: // %bb.2:
-; SDISEL-NEXT: mov w0, wzr
-; SDISEL-NEXT: ret
-; SDISEL-NEXT: .LBB2_3: // %if
-; SDISEL-NEXT: mov w0, #1 // =0x1
-; SDISEL-NEXT: str w0, [x6]
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ult_ule_ugt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w2, w3
+; CHECK-SD-NEXT: ccmp w0, w1, #2, ls
+; CHECK-SD-NEXT: b.lo .LBB2_3
+; CHECK-SD-NEXT: // %bb.1: // %entry
+; CHECK-SD-NEXT: cmp w4, w5
+; CHECK-SD-NEXT: b.hi .LBB2_3
+; CHECK-SD-NEXT: // %bb.2:
+; CHECK-SD-NEXT: mov w0, wzr
+; CHECK-SD-NEXT: ret
+; CHECK-SD-NEXT: .LBB2_3: // %if
+; CHECK-SD-NEXT: mov w0, #1 // =0x1
+; CHECK-SD-NEXT: str w0, [x6]
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: and_ult_ule_ugt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lo
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ls
-; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: tbnz w8, #0, .LBB2_3
-; GISEL-NEXT: // %bb.1: // %entry
-; GISEL-NEXT: cmp w4, w5
-; GISEL-NEXT: mov w0, wzr
-; GISEL-NEXT: b.hi .LBB2_3
-; GISEL-NEXT: // %bb.2: // %common.ret
-; GISEL-NEXT: ret
-; GISEL-NEXT: .LBB2_3: // %if
-; GISEL-NEXT: mov w0, #1 // =0x1
-; GISEL-NEXT: str w0, [x6]
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: and_ult_ule_ugt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ls
+; CHECK-GI-NEXT: and w8, w8, w9
+; CHECK-GI-NEXT: tbnz w8, #0, .LBB2_3
+; CHECK-GI-NEXT: // %bb.1: // %entry
+; CHECK-GI-NEXT: cmp w4, w5
+; CHECK-GI-NEXT: mov w0, wzr
+; CHECK-GI-NEXT: b.hi .LBB2_3
+; CHECK-GI-NEXT: // %bb.2: // %common.ret
+; CHECK-GI-NEXT: ret
+; CHECK-GI-NEXT: .LBB2_3: // %if
+; CHECK-GI-NEXT: mov w0, #1 // =0x1
+; CHECK-GI-NEXT: str w0, [x6]
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ult i32 %s0, %s1
%c1 = icmp ule i32 %s2, %s3
@@ -158,40 +158,40 @@ else:
}
define i32 @and_ule_ugt_uge(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, ptr %p) {
-; SDISEL-LABEL: and_ule_ugt_uge:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w2, w3
-; SDISEL-NEXT: ccmp w0, w1, #2, hi
-; SDISEL-NEXT: b.ls .LBB3_3
-; SDISEL-NEXT: // %bb.1: // %entry
-; SDISEL-NEXT: cmp w4, w5
-; SDISEL-NEXT: b.hs .LBB3_3
-; SDISEL-NEXT: // %bb.2:
-; SDISEL-NEXT: mov w0, wzr
-; SDISEL-NEXT: ret
-; SDISEL-NEXT: .LBB3_3: // %if
-; SDISEL-NEXT: mov w0, #1 // =0x1
-; SDISEL-NEXT: str w0, [x6]
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ule_ugt_uge:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w2, w3
+; CHECK-SD-NEXT: ccmp w0, w1, #2, hi
+; CHECK-SD-NEXT: b.ls .LBB3_3
+; CHECK-SD-NEXT: // %bb.1: // %entry
+; CHECK-SD-NEXT: cmp w4, w5
+; CHECK-SD-NEXT: b.hs .LBB3_3
+; CHECK-SD-NEXT: // %bb.2:
+; CHECK-SD-NEXT: mov w0, wzr
+; CHECK-SD-NEXT: ret
+; CHECK-SD-NEXT: .LBB3_3: // %if
+; CHECK-SD-NEXT: mov w0, #1 // =0x1
+; CHECK-SD-NEXT: str w0, [x6]
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: and_ule_ugt_uge:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, ls
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hi
-; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: tbnz w8, #0, .LBB3_3
-; GISEL-NEXT: // %bb.1: // %entry
-; GISEL-NEXT: cmp w4, w5
-; GISEL-NEXT: mov w0, wzr
-; GISEL-NEXT: b.hs .LBB3_3
-; GISEL-NEXT: // %bb.2: // %common.ret
-; GISEL-NEXT: ret
-; GISEL-NEXT: .LBB3_3: // %if
-; GISEL-NEXT: mov w0, #1 // =0x1
-; GISEL-NEXT: str w0, [x6]
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: and_ule_ugt_uge:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, ls
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hi
+; CHECK-GI-NEXT: and w8, w8, w9
+; CHECK-GI-NEXT: tbnz w8, #0, .LBB3_3
+; CHECK-GI-NEXT: // %bb.1: // %entry
+; CHECK-GI-NEXT: cmp w4, w5
+; CHECK-GI-NEXT: mov w0, wzr
+; CHECK-GI-NEXT: b.hs .LBB3_3
+; CHECK-GI-NEXT: // %bb.2: // %common.ret
+; CHECK-GI-NEXT: ret
+; CHECK-GI-NEXT: .LBB3_3: // %if
+; CHECK-GI-NEXT: mov w0, #1 // =0x1
+; CHECK-GI-NEXT: str w0, [x6]
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ule i32 %s0, %s1
%c1 = icmp ugt i32 %s2, %s3
@@ -209,40 +209,40 @@ else:
}
define i32 @and_ugt_uge_slt(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, ptr %p) {
-; SDISEL-LABEL: and_ugt_uge_slt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w2, w3
-; SDISEL-NEXT: ccmp w0, w1, #0, hs
-; SDISEL-NEXT: b.hi .LBB4_3
-; SDISEL-NEXT: // %bb.1: // %entry
-; SDISEL-NEXT: cmp w4, w5
-; SDISEL-NEXT: b.lt .LBB4_3
-; SDISEL-NEXT: // %bb.2:
-; SDISEL-NEXT: mov w0, wzr
-; SDISEL-NEXT: ret
-; SDISEL-NEXT: .LBB4_3: // %if
-; SDISEL-NEXT: mov w0, #1 // =0x1
-; SDISEL-NEXT: str w0, [x6]
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: and_ugt_uge_slt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w2, w3
+; CHECK-SD-NEXT: ccmp w0, w1, #0, hs
+; CHECK-SD-NEXT: b.hi .LBB4_3
+; CHECK-SD-NEXT: // %bb.1: // %entry
+; CHECK-SD-NEXT: cmp w4, w5
+; CHECK-SD-NEXT: b.lt .LBB4_3
+; CHECK-SD-NEXT: // %bb.2:
+; CHECK-SD-NEXT: mov w0, wzr
+; CHECK-SD-NEXT: ret
+; CHECK-SD-NEXT: .LBB4_3: // %if
+; CHECK-SD-NEXT: mov w0, #1 // =0x1
+; CHECK-SD-NEXT: str w0, [x6]
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: and_ugt_uge_slt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, hi
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hs
-; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: tbnz w8, #0, .LBB4_3
-; GISEL-NEXT: // %bb.1: // %entry
-; GISEL-NEXT: cmp w4, w5
-; GISEL-NEXT: mov w0, wzr
-; GISEL-NEXT: b.lt .LBB4_3
-; GISEL-NEXT: // %bb.2: // %common.ret
-; GISEL-NEXT: ret
-; GISEL-NEXT: .LBB4_3: // %if
-; GISEL-NEXT: mov w0, #1 // =0x1
-; GISEL-NEXT: str w0, [x6]
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: and_ugt_uge_slt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, hi
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hs
+; CHECK-GI-NEXT: and w8, w8, w9
+; CHECK-GI-NEXT: tbnz w8, #0, .LBB4_3
+; CHECK-GI-NEXT: // %bb.1: // %entry
+; CHECK-GI-NEXT: cmp w4, w5
+; CHECK-GI-NEXT: mov w0, wzr
+; CHECK-GI-NEXT: b.lt .LBB4_3
+; CHECK-GI-NEXT: // %bb.2: // %common.ret
+; CHECK-GI-NEXT: ret
+; CHECK-GI-NEXT: .LBB4_3: // %if
+; CHECK-GI-NEXT: mov w0, #1 // =0x1
+; CHECK-GI-NEXT: str w0, [x6]
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp ugt i32 %s0, %s1
%c1 = icmp uge i32 %s2, %s3
@@ -260,40 +260,40 @@ else:
}
define i32 @and_uge_slt_sle(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, ptr %p) {
-; SDISEL-LABEL: and_uge_slt_sle:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w2, w3
-; SDISEL-NEXT: ccmp w0, w1, #0, lt
-; SDISEL-NEXT: b.hs .LBB5_3
-; SDISEL-NEXT: // %bb.1: // %entry
-; SDISEL-NEXT: cmp w4, w5
-; SDISEL-NEXT: b.le .LBB5_3
-; SDISEL-NEXT: // %bb.2:
-; SDISEL-NEXT: mov w0, wzr
-; SDISEL-NEXT: ret
-; SDISEL-NEXT: .LBB5_3: // %if
-; SDISEL-NEXT: mov w0, #1 // =0x1
-; SDISEL-NEXT: str w0, [x6]
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: and_uge_slt_sle:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w2, w3
+; CHECK-SD-NEXT: ccmp w0, w1, #0, lt
+; CHECK-SD-NEXT: b.hs .LBB5_3
+; CHECK-SD-NEXT: // %bb.1: // %entry
+; CHECK-SD-NEXT: cmp w4, w5
+; CHECK-SD-NEXT: b.le .LBB5_3
+; CHECK-SD-NEXT: // %bb.2:
+; CHECK-SD-NEXT: mov w0, wzr
+; CHECK-SD-NEXT: ret
+; CHECK-SD-NEXT: .LBB5_3: // %if
+; CHECK-SD-NEXT: mov w0, #1 // =0x1
+; CHECK-SD-NEXT: str w0, [x6]
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: and_uge_slt_sle:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, hs
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, lt
-; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: tbnz w8, #0, .LBB5_3
-; GISEL-NEXT: // %bb.1: // %entry
-; GISEL-NEXT: cmp w4, w5
-; GISEL-NEXT: mov w0, wzr
-; GISEL-NEXT: b.le .LBB5_3
-; GISEL-NEXT: // %bb.2: // %common.ret
-; GISEL-NEXT: ret
-; GISEL-NEXT: .LBB5_3: // %if
-; GISEL-NEXT: mov w0, #1 // =0x1
-; GISEL-NEXT: str w0, [x6]
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: and_uge_slt_sle:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, hs
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, lt
+; CHECK-GI-NEXT: and w8, w8, w9
+; CHECK-GI-NEXT: tbnz w8, #0, .LBB5_3
+; CHECK-GI-NEXT: // %bb.1: // %entry
+; CHECK-GI-NEXT: cmp w4, w5
+; CHECK-GI-NEXT: mov w0, wzr
+; CHECK-GI-NEXT: b.le .LBB5_3
+; CHECK-GI-NEXT: // %bb.2: // %common.ret
+; CHECK-GI-NEXT: ret
+; CHECK-GI-NEXT: .LBB5_3: // %if
+; CHECK-GI-NEXT: mov w0, #1 // =0x1
+; CHECK-GI-NEXT: str w0, [x6]
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp uge i32 %s0, %s1
%c1 = icmp slt i32 %s2, %s3
@@ -311,40 +311,40 @@ else:
}
define i32 @and_slt_sle_sgt(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, ptr %p) {
-; SDISEL-LABEL: and_slt_sle_sgt:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w2, w3
-; SDISEL-NEXT: ccmp w0, w1, #0, le
-; SDISEL-NEXT: b.lt .LBB6_3
-; SDISEL-NEXT: // %bb.1: // %entry
-; SDISEL-NEXT: cmp w4, w5
-; SDISEL-NEXT: b.gt .LBB6_3
-; SDISEL-NEXT: // %bb.2:
-; SDISEL-NEXT: mov w0, wzr
-; SDISEL-NEXT: ret
-; SDISEL-NEXT: .LBB6_3: // %if
-; SDISEL-NEXT: mov w0, #1 // =0x1
-; SDISEL-NEXT: str w0, [x6]
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: and_slt_sle_sgt:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w2, w3
+; CHECK-SD-NEXT: ccmp w0, w1, #0, le
+; CHECK-SD-NEXT: b.lt .LBB6_3
+; CHECK-SD-NEXT: // %bb.1: // %entry
+; CHECK-SD-NEXT: cmp w4, w5
+; CHECK-SD-NEXT: b.gt .LBB6_3
+; CHECK-SD-NEXT: // %bb.2:
+; CHECK-SD-NEXT: mov w0, wzr
+; CHECK-SD-NEXT: ret
+; CHECK-SD-NEXT: .LBB6_3: // %if
+; CHECK-SD-NEXT: mov w0, #1 // =0x1
+; CHECK-SD-NEXT: str w0, [x6]
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: and_slt_sle_sgt:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lt
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, le
-; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: tbnz w8, #0, .LBB6_3
-; GISEL-NEXT: // %bb.1: // %entry
-; GISEL-NEXT: cmp w4, w5
-; GISEL-NEXT: mov w0, wzr
-; GISEL-NEXT: b.gt .LBB6_3
-; GISEL-NEXT: // %bb.2: // %common.ret
-; GISEL-NEXT: ret
-; GISEL-NEXT: .LBB6_3: // %if
-; GISEL-NEXT: mov w0, #1 // =0x1
-; GISEL-NEXT: str w0, [x6]
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: and_slt_sle_sgt:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lt
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, le
+; CHECK-GI-NEXT: and w8, w8, w9
+; CHECK-GI-NEXT: tbnz w8, #0, .LBB6_3
+; CHECK-GI-NEXT: // %bb.1: // %entry
+; CHECK-GI-NEXT: cmp w4, w5
+; CHECK-GI-NEXT: mov w0, wzr
+; CHECK-GI-NEXT: b.gt .LBB6_3
+; CHECK-GI-NEXT: // %bb.2: // %common.ret
+; CHECK-GI-NEXT: ret
+; CHECK-GI-NEXT: .LBB6_3: // %if
+; CHECK-GI-NEXT: mov w0, #1 // =0x1
+; CHECK-GI-NEXT: str w0, [x6]
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp slt i32 %s0, %s1
%c1 = icmp sle i32 %s2, %s3
@@ -362,40 +362,40 @@ else:
}
define i32 @and_sle_sgt_sge(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, ptr %p) {
-; SDISEL-LABEL: and_sle_sgt_sge:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: cmp w2, w3
-; SDISEL-NEXT: ccmp w0, w1, #0, gt
-; SDISEL-NEXT: b.le .LBB7_3
-; SDISEL-NEXT: // %bb.1: // %entry
-; SDISEL-NEXT: cmp w4, w5
-; SDISEL-NEXT: b.ge .LBB7_3
-; SDISEL-NEXT: // %bb.2:
-; SDISEL-NEXT: mov w0, wzr
-; SDISEL-NEXT: ret
-; SDISEL-NEXT: .LBB7_3: // %if
-; SDISEL-NEXT: mov w0, #1 // =0x1
-; SDISEL-NEXT: str w0, [x6]
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: and_sle_sgt_sge:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: cmp w2, w3
+; CHECK-SD-NEXT: ccmp w0, w1, #0, gt
+; CHECK-SD-NEXT: b.le .LBB7_3
+; CHECK-SD-NEXT: // %bb.1: // %entry
+; CHECK-SD-NEXT: cmp w4, w5
+; CHECK-SD-NEXT: b.ge .LBB7_3
+; CHECK-SD-NEXT: // %bb.2:
+; CHECK-SD-NEXT: mov w0, wzr
+; CHECK-SD-NEXT: ret
+; CHECK-SD-NEXT: .LBB7_3: // %if
+; CHECK-SD-NEXT: mov w0, #1 // =0x1
+; CHECK-SD-NEXT: str w0, [x6]
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: and_sle_sgt_sge:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, le
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, gt
-; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: tbnz w8, #0, .LBB7_3
-; GISEL-NEXT: // %bb.1: // %entry
-; GISEL-NEXT: cmp w4, w5
-; GISEL-NEXT: mov w0, wzr
-; GISEL-NEXT: b.ge .LBB7_3
-; GISEL-NEXT: // %bb.2: // %common.ret
-; GISEL-NEXT: ret
-; GISEL-NEXT: .LBB7_3: // %if
-; GISEL-NEXT: mov w0, #1 // =0x1
-; GISEL-NEXT: str w0, [x6]
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: and_sle_sgt_sge:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, le
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, gt
+; CHECK-GI-NEXT: and w8, w8, w9
+; CHECK-GI-NEXT: tbnz w8, #0, .LBB7_3
+; CHECK-GI-NEXT: // %bb.1: // %entry
+; CHECK-GI-NEXT: cmp w4, w5
+; CHECK-GI-NEXT: mov w0, wzr
+; CHECK-GI-NEXT: b.ge .LBB7_3
+; CHECK-GI-NEXT: // %bb.2: // %common.ret
+; CHECK-GI-NEXT: ret
+; CHECK-GI-NEXT: .LBB7_3: // %if
+; CHECK-GI-NEXT: mov w0, #1 // =0x1
+; CHECK-GI-NEXT: str w0, [x6]
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sle i32 %s0, %s1
%c1 = icmp sgt i32 %s2, %s3
diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
index 06e957f..a546ffd 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp | FileCheck %s --check-prefixes=CHECK,SDISEL
-; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp -global-isel | FileCheck %s --check-prefixes=CHECK,GISEL
+; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -debugify-and-strip-all-safe -mcpu=cyclone -verify-machineinstrs -aarch64-enable-ccmp -aarch64-stress-ccmp -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
target triple = "arm64-apple-ios"
define i32 @single_same(i32 %a, i32 %b) nounwind ssp {
@@ -32,31 +32,31 @@ if.end:
; Different condition codes for the two compares.
define i32 @single_different(i32 %a, i32 %b) nounwind ssp {
-; SDISEL-LABEL: single_different:
-; SDISEL: ; %bb.0: ; %entry
-; SDISEL-NEXT: cmp w0, #6
-; SDISEL-NEXT: ccmp w1, #17, #0, ge
-; SDISEL-NEXT: b.eq LBB1_2
-; SDISEL-NEXT: ; %bb.1: ; %if.then
-; SDISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
-; SDISEL-NEXT: bl _foo
-; SDISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; SDISEL-NEXT: LBB1_2: ; %if.end
-; SDISEL-NEXT: mov w0, #7 ; =0x7
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: single_different:
+; CHECK-SD: ; %bb.0: ; %entry
+; CHECK-SD-NEXT: cmp w0, #6
+; CHECK-SD-NEXT: ccmp w1, #17, #0, ge
+; CHECK-SD-NEXT: b.eq LBB1_2
+; CHECK-SD-NEXT: ; %bb.1: ; %if.then
+; CHECK-SD-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-SD-NEXT: bl _foo
+; CHECK-SD-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-SD-NEXT: LBB1_2: ; %if.end
+; CHECK-SD-NEXT: mov w0, #7 ; =0x7
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: single_different:
-; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: cmp w0, #5
-; GISEL-NEXT: ccmp w1, #17, #0, gt
-; GISEL-NEXT: b.eq LBB1_2
-; GISEL-NEXT: ; %bb.1: ; %if.then
-; GISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
-; GISEL-NEXT: bl _foo
-; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; GISEL-NEXT: LBB1_2: ; %if.end
-; GISEL-NEXT: mov w0, #7 ; =0x7
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: single_different:
+; CHECK-GI: ; %bb.0: ; %entry
+; CHECK-GI-NEXT: cmp w0, #5
+; CHECK-GI-NEXT: ccmp w1, #17, #0, gt
+; CHECK-GI-NEXT: b.eq LBB1_2
+; CHECK-GI-NEXT: ; %bb.1: ; %if.then
+; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-GI-NEXT: bl _foo
+; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-GI-NEXT: LBB1_2: ; %if.end
+; CHECK-GI-NEXT: mov w0, #7 ; =0x7
+; CHECK-GI-NEXT: ret
entry:
%cmp = icmp sle i32 %a, 5
%cmp1 = icmp ne i32 %b, 17
@@ -73,41 +73,41 @@ if.end:
; Second block clobbers the flags, can't convert (easily).
define i32 @single_flagclobber(i32 %a, i32 %b) nounwind ssp {
-; SDISEL-LABEL: single_flagclobber:
-; SDISEL: ; %bb.0: ; %entry
-; SDISEL-NEXT: cmp w0, #5
-; SDISEL-NEXT: b.eq LBB2_2
-; SDISEL-NEXT: ; %bb.1: ; %lor.lhs.false
-; SDISEL-NEXT: lsl w8, w1, #1
-; SDISEL-NEXT: cmp w1, #7
-; SDISEL-NEXT: csinc w8, w8, w1, lt
-; SDISEL-NEXT: cmp w8, #16
-; SDISEL-NEXT: b.gt LBB2_3
-; SDISEL-NEXT: LBB2_2: ; %if.then
-; SDISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
-; SDISEL-NEXT: bl _foo
-; SDISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; SDISEL-NEXT: LBB2_3: ; %if.end
-; SDISEL-NEXT: mov w0, #7 ; =0x7
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: single_flagclobber:
+; CHECK-SD: ; %bb.0: ; %entry
+; CHECK-SD-NEXT: cmp w0, #5
+; CHECK-SD-NEXT: b.eq LBB2_2
+; CHECK-SD-NEXT: ; %bb.1: ; %lor.lhs.false
+; CHECK-SD-NEXT: lsl w8, w1, #1
+; CHECK-SD-NEXT: cmp w1, #7
+; CHECK-SD-NEXT: csinc w8, w8, w1, lt
+; CHECK-SD-NEXT: cmp w8, #16
+; CHECK-SD-NEXT: b.gt LBB2_3
+; CHECK-SD-NEXT: LBB2_2: ; %if.then
+; CHECK-SD-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-SD-NEXT: bl _foo
+; CHECK-SD-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-SD-NEXT: LBB2_3: ; %if.end
+; CHECK-SD-NEXT: mov w0, #7 ; =0x7
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: single_flagclobber:
-; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: cmp w0, #5
-; GISEL-NEXT: b.eq LBB2_2
-; GISEL-NEXT: ; %bb.1: ; %lor.lhs.false
-; GISEL-NEXT: lsl w8, w1, #1
-; GISEL-NEXT: cmp w1, #7
-; GISEL-NEXT: csinc w8, w8, w1, lt
-; GISEL-NEXT: cmp w8, #17
-; GISEL-NEXT: b.ge LBB2_3
-; GISEL-NEXT: LBB2_2: ; %if.then
-; GISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
-; GISEL-NEXT: bl _foo
-; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; GISEL-NEXT: LBB2_3: ; %if.end
-; GISEL-NEXT: mov w0, #7 ; =0x7
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: single_flagclobber:
+; CHECK-GI: ; %bb.0: ; %entry
+; CHECK-GI-NEXT: cmp w0, #5
+; CHECK-GI-NEXT: b.eq LBB2_2
+; CHECK-GI-NEXT: ; %bb.1: ; %lor.lhs.false
+; CHECK-GI-NEXT: lsl w8, w1, #1
+; CHECK-GI-NEXT: cmp w1, #7
+; CHECK-GI-NEXT: csinc w8, w8, w1, lt
+; CHECK-GI-NEXT: cmp w8, #17
+; CHECK-GI-NEXT: b.ge LBB2_3
+; CHECK-GI-NEXT: LBB2_2: ; %if.then
+; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-GI-NEXT: bl _foo
+; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-GI-NEXT: LBB2_3: ; %if.end
+; CHECK-GI-NEXT: mov w0, #7 ; =0x7
+; CHECK-GI-NEXT: ret
entry:
%cmp = icmp eq i32 %a, 5
br i1 %cmp, label %if.then, label %lor.lhs.false
@@ -171,37 +171,37 @@ if.end: ; preds = %if.then, %lor.lhs.f
; The sdiv/udiv instructions do not trap when the divisor is zero, so they are
; safe to speculate.
define i32 @speculate_division(i32 %a, i32 %b) nounwind ssp {
-; SDISEL-LABEL: speculate_division:
-; SDISEL: ; %bb.0: ; %entry
-; SDISEL-NEXT: cmp w0, #1
-; SDISEL-NEXT: sdiv w8, w1, w0
-; SDISEL-NEXT: ccmp w8, #16, #0, ge
-; SDISEL-NEXT: b.le LBB4_2
-; SDISEL-NEXT: ; %bb.1: ; %if.end
-; SDISEL-NEXT: mov w0, #7 ; =0x7
-; SDISEL-NEXT: ret
-; SDISEL-NEXT: LBB4_2: ; %if.then
-; SDISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
-; SDISEL-NEXT: bl _foo
-; SDISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; SDISEL-NEXT: mov w0, #7 ; =0x7
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: speculate_division:
+; CHECK-SD: ; %bb.0: ; %entry
+; CHECK-SD-NEXT: cmp w0, #1
+; CHECK-SD-NEXT: sdiv w8, w1, w0
+; CHECK-SD-NEXT: ccmp w8, #16, #0, ge
+; CHECK-SD-NEXT: b.le LBB4_2
+; CHECK-SD-NEXT: ; %bb.1: ; %if.end
+; CHECK-SD-NEXT: mov w0, #7 ; =0x7
+; CHECK-SD-NEXT: ret
+; CHECK-SD-NEXT: LBB4_2: ; %if.then
+; CHECK-SD-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-SD-NEXT: bl _foo
+; CHECK-SD-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-SD-NEXT: mov w0, #7 ; =0x7
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: speculate_division:
-; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: cmp w0, #0
-; GISEL-NEXT: sdiv w8, w1, w0
-; GISEL-NEXT: ccmp w8, #17, #0, gt
-; GISEL-NEXT: b.lt LBB4_2
-; GISEL-NEXT: ; %bb.1: ; %if.end
-; GISEL-NEXT: mov w0, #7 ; =0x7
-; GISEL-NEXT: ret
-; GISEL-NEXT: LBB4_2: ; %if.then
-; GISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
-; GISEL-NEXT: bl _foo
-; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; GISEL-NEXT: mov w0, #7 ; =0x7
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: speculate_division:
+; CHECK-GI: ; %bb.0: ; %entry
+; CHECK-GI-NEXT: cmp w0, #0
+; CHECK-GI-NEXT: sdiv w8, w1, w0
+; CHECK-GI-NEXT: ccmp w8, #17, #0, gt
+; CHECK-GI-NEXT: b.lt LBB4_2
+; CHECK-GI-NEXT: ; %bb.1: ; %if.end
+; CHECK-GI-NEXT: mov w0, #7 ; =0x7
+; CHECK-GI-NEXT: ret
+; CHECK-GI-NEXT: LBB4_2: ; %if.then
+; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-GI-NEXT: bl _foo
+; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-GI-NEXT: mov w0, #7 ; =0x7
+; CHECK-GI-NEXT: ret
entry:
%cmp = icmp sgt i32 %a, 0
br i1 %cmp, label %land.lhs.true, label %if.end
@@ -221,41 +221,41 @@ if.end:
; Floating point compare.
define i32 @single_fcmp(i32 %a, float %b) nounwind ssp {
-; SDISEL-LABEL: single_fcmp:
-; SDISEL: ; %bb.0: ; %entry
-; SDISEL-NEXT: cmp w0, #1
-; SDISEL-NEXT: scvtf s1, w0
-; SDISEL-NEXT: fdiv s0, s0, s1
-; SDISEL-NEXT: fmov s1, #17.00000000
-; SDISEL-NEXT: fccmp s0, s1, #8, ge
-; SDISEL-NEXT: b.ge LBB5_2
-; SDISEL-NEXT: ; %bb.1: ; %if.end
-; SDISEL-NEXT: mov w0, #7 ; =0x7
-; SDISEL-NEXT: ret
-; SDISEL-NEXT: LBB5_2: ; %if.then
-; SDISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
-; SDISEL-NEXT: bl _foo
-; SDISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; SDISEL-NEXT: mov w0, #7 ; =0x7
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: single_fcmp:
+; CHECK-SD: ; %bb.0: ; %entry
+; CHECK-SD-NEXT: cmp w0, #1
+; CHECK-SD-NEXT: scvtf s1, w0
+; CHECK-SD-NEXT: fdiv s0, s0, s1
+; CHECK-SD-NEXT: fmov s1, #17.00000000
+; CHECK-SD-NEXT: fccmp s0, s1, #8, ge
+; CHECK-SD-NEXT: b.ge LBB5_2
+; CHECK-SD-NEXT: ; %bb.1: ; %if.end
+; CHECK-SD-NEXT: mov w0, #7 ; =0x7
+; CHECK-SD-NEXT: ret
+; CHECK-SD-NEXT: LBB5_2: ; %if.then
+; CHECK-SD-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-SD-NEXT: bl _foo
+; CHECK-SD-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-SD-NEXT: mov w0, #7 ; =0x7
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: single_fcmp:
-; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: cmp w0, #0
-; GISEL-NEXT: scvtf s1, w0
-; GISEL-NEXT: fdiv s0, s0, s1
-; GISEL-NEXT: fmov s1, #17.00000000
-; GISEL-NEXT: fccmp s0, s1, #8, gt
-; GISEL-NEXT: b.ge LBB5_2
-; GISEL-NEXT: ; %bb.1: ; %if.end
-; GISEL-NEXT: mov w0, #7 ; =0x7
-; GISEL-NEXT: ret
-; GISEL-NEXT: LBB5_2: ; %if.then
-; GISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
-; GISEL-NEXT: bl _foo
-; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; GISEL-NEXT: mov w0, #7 ; =0x7
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: single_fcmp:
+; CHECK-GI: ; %bb.0: ; %entry
+; CHECK-GI-NEXT: cmp w0, #0
+; CHECK-GI-NEXT: scvtf s1, w0
+; CHECK-GI-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NEXT: fmov s1, #17.00000000
+; CHECK-GI-NEXT: fccmp s0, s1, #8, gt
+; CHECK-GI-NEXT: b.ge LBB5_2
+; CHECK-GI-NEXT: ; %bb.1: ; %if.end
+; CHECK-GI-NEXT: mov w0, #7 ; =0x7
+; CHECK-GI-NEXT: ret
+; CHECK-GI-NEXT: LBB5_2: ; %if.then
+; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-GI-NEXT: bl _foo
+; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-GI-NEXT: mov w0, #7 ; =0x7
+; CHECK-GI-NEXT: ret
entry:
%cmp = icmp sgt i32 %a, 0
br i1 %cmp, label %land.lhs.true, label %if.end
@@ -499,28 +499,28 @@ define float @select_or_float(i32 %w0, i32 %w1, float %x2, float %x3) {
}
define i64 @gccbug(i64 %x0, i64 %x1) {
-; SDISEL-LABEL: gccbug:
-; SDISEL: ; %bb.0:
-; SDISEL-NEXT: cmp x0, #2
-; SDISEL-NEXT: ccmp x0, #4, #4, ne
-; SDISEL-NEXT: ccmp x1, #0, #0, eq
-; SDISEL-NEXT: mov w8, #1 ; =0x1
-; SDISEL-NEXT: cinc x0, x8, eq
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: gccbug:
+; CHECK-SD: ; %bb.0:
+; CHECK-SD-NEXT: cmp x0, #2
+; CHECK-SD-NEXT: ccmp x0, #4, #4, ne
+; CHECK-SD-NEXT: ccmp x1, #0, #0, eq
+; CHECK-SD-NEXT: mov w8, #1 ; =0x1
+; CHECK-SD-NEXT: cinc x0, x8, eq
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: gccbug:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: cmp x1, #0
-; GISEL-NEXT: cset w8, eq
-; GISEL-NEXT: cmp x0, #2
-; GISEL-NEXT: cset w9, eq
-; GISEL-NEXT: cmp x0, #4
-; GISEL-NEXT: cset w10, eq
-; GISEL-NEXT: orr w9, w10, w9
-; GISEL-NEXT: and w8, w9, w8
-; GISEL-NEXT: and x8, x8, #0x1
-; GISEL-NEXT: add x0, x8, #1
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: gccbug:
+; CHECK-GI: ; %bb.0:
+; CHECK-GI-NEXT: cmp x1, #0
+; CHECK-GI-NEXT: cset w8, eq
+; CHECK-GI-NEXT: cmp x0, #2
+; CHECK-GI-NEXT: cset w9, eq
+; CHECK-GI-NEXT: cmp x0, #4
+; CHECK-GI-NEXT: cset w10, eq
+; CHECK-GI-NEXT: orr w9, w10, w9
+; CHECK-GI-NEXT: and w8, w9, w8
+; CHECK-GI-NEXT: and x8, x8, #0x1
+; CHECK-GI-NEXT: add x0, x8, #1
+; CHECK-GI-NEXT: ret
%cmp0 = icmp eq i64 %x1, 0
%cmp1 = icmp eq i64 %x0, 2
%cmp2 = icmp eq i64 %x0, 4
@@ -570,23 +570,23 @@ define i32 @select_andor(i32 %v1, i32 %v2, i32 %v3) {
}
define i32 @select_andor32(i32 %v1, i32 %v2, i32 %v3) {
-; SDISEL-LABEL: select_andor32:
-; SDISEL: ; %bb.0:
-; SDISEL-NEXT: cmp w1, w2
-; SDISEL-NEXT: mov w8, #32 ; =0x20
-; SDISEL-NEXT: ccmp w0, w8, #4, lt
-; SDISEL-NEXT: ccmp w0, w1, #0, eq
-; SDISEL-NEXT: csel w0, w0, w1, eq
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: select_andor32:
+; CHECK-SD: ; %bb.0:
+; CHECK-SD-NEXT: cmp w1, w2
+; CHECK-SD-NEXT: mov w8, #32 ; =0x20
+; CHECK-SD-NEXT: ccmp w0, w8, #4, lt
+; CHECK-SD-NEXT: ccmp w0, w1, #0, eq
+; CHECK-SD-NEXT: csel w0, w0, w1, eq
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: select_andor32:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: mov w8, #32 ; =0x20
-; GISEL-NEXT: cmp w1, w2
-; GISEL-NEXT: ccmp w0, w8, #4, lt
-; GISEL-NEXT: ccmp w0, w1, #0, eq
-; GISEL-NEXT: csel w0, w0, w1, eq
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: select_andor32:
+; CHECK-GI: ; %bb.0:
+; CHECK-GI-NEXT: mov w8, #32 ; =0x20
+; CHECK-GI-NEXT: cmp w1, w2
+; CHECK-GI-NEXT: ccmp w0, w8, #4, lt
+; CHECK-GI-NEXT: ccmp w0, w1, #0, eq
+; CHECK-GI-NEXT: csel w0, w0, w1, eq
+; CHECK-GI-NEXT: ret
%c0 = icmp eq i32 %v1, %v2
%c1 = icmp sge i32 %v2, %v3
%c2 = icmp eq i32 %v1, 32
@@ -597,22 +597,22 @@ define i32 @select_andor32(i32 %v1, i32 %v2, i32 %v3) {
}
define i64 @select_noccmp1(i64 %v1, i64 %v2, i64 %v3, i64 %r) {
-; SDISEL-LABEL: select_noccmp1:
-; SDISEL: ; %bb.0:
-; SDISEL-NEXT: cmp x0, #0
-; SDISEL-NEXT: ccmp x0, #13, #4, lt
-; SDISEL-NEXT: cset w8, gt
-; SDISEL-NEXT: cmp x2, #2
-; SDISEL-NEXT: ccmp x2, #4, #4, lt
-; SDISEL-NEXT: csinc w8, w8, wzr, le
-; SDISEL-NEXT: cmp w8, #0
-; SDISEL-NEXT: csel x0, xzr, x3, ne
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: select_noccmp1:
+; CHECK-SD: ; %bb.0:
+; CHECK-SD-NEXT: cmp x0, #0
+; CHECK-SD-NEXT: ccmp x0, #13, #4, lt
+; CHECK-SD-NEXT: cset w8, gt
+; CHECK-SD-NEXT: cmp x2, #2
+; CHECK-SD-NEXT: ccmp x2, #4, #4, lt
+; CHECK-SD-NEXT: csinc w8, w8, wzr, le
+; CHECK-SD-NEXT: cmp w8, #0
+; CHECK-SD-NEXT: csel x0, xzr, x3, ne
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: select_noccmp1:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: mov x0, x3
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: select_noccmp1:
+; CHECK-GI: ; %bb.0:
+; CHECK-GI-NEXT: mov x0, x3
+; CHECK-GI-NEXT: ret
%c0 = icmp slt i64 %v1, 0
%c1 = icmp sgt i64 %v1, 13
%c2 = icmp slt i64 %v3, 2
@@ -627,28 +627,28 @@ define i64 @select_noccmp1(i64 %v1, i64 %v2, i64 %v3, i64 %r) {
@g = global i32 0
define i64 @select_noccmp2(i64 %v1, i64 %v2, i64 %v3, i64 %r) {
-; SDISEL-LABEL: select_noccmp2:
-; SDISEL: ; %bb.0:
-; SDISEL-NEXT: cmp x0, #0
-; SDISEL-NEXT: ccmp x0, #13, #0, ge
-; SDISEL-NEXT: cset w8, gt
-; SDISEL-NEXT: cmp w8, #0
-; SDISEL-NEXT: csel x0, xzr, x3, ne
-; SDISEL-NEXT: sbfx w8, w8, #0, #1
-; SDISEL-NEXT: adrp x9, _g@PAGE
-; SDISEL-NEXT: str w8, [x9, _g@PAGEOFF]
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: select_noccmp2:
+; CHECK-SD: ; %bb.0:
+; CHECK-SD-NEXT: cmp x0, #0
+; CHECK-SD-NEXT: ccmp x0, #13, #0, ge
+; CHECK-SD-NEXT: cset w8, gt
+; CHECK-SD-NEXT: cmp w8, #0
+; CHECK-SD-NEXT: csel x0, xzr, x3, ne
+; CHECK-SD-NEXT: sbfx w8, w8, #0, #1
+; CHECK-SD-NEXT: adrp x9, _g@PAGE
+; CHECK-SD-NEXT: str w8, [x9, _g@PAGEOFF]
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: select_noccmp2:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: cmp x0, #14
-; GISEL-NEXT: cset w8, hs
-; GISEL-NEXT: tst w8, #0x1
-; GISEL-NEXT: csel x0, xzr, x3, ne
-; GISEL-NEXT: sbfx w8, w8, #0, #1
-; GISEL-NEXT: adrp x9, _g@PAGE
-; GISEL-NEXT: str w8, [x9, _g@PAGEOFF]
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: select_noccmp2:
+; CHECK-GI: ; %bb.0:
+; CHECK-GI-NEXT: cmp x0, #14
+; CHECK-GI-NEXT: cset w8, hs
+; CHECK-GI-NEXT: tst w8, #0x1
+; CHECK-GI-NEXT: csel x0, xzr, x3, ne
+; CHECK-GI-NEXT: sbfx w8, w8, #0, #1
+; CHECK-GI-NEXT: adrp x9, _g@PAGE
+; CHECK-GI-NEXT: str w8, [x9, _g@PAGEOFF]
+; CHECK-GI-NEXT: ret
%c0 = icmp slt i64 %v1, 0
%c1 = icmp sgt i64 %v1, 13
%or = or i1 %c0, %c1
@@ -661,33 +661,33 @@ define i64 @select_noccmp2(i64 %v1, i64 %v2, i64 %v3, i64 %r) {
; The following is not possible to implement with a single cmp;ccmp;csel
; sequence.
define i32 @select_noccmp3(i32 %v0, i32 %v1, i32 %v2) {
-; SDISEL-LABEL: select_noccmp3:
-; SDISEL: ; %bb.0:
-; SDISEL-NEXT: cmp w0, #0
-; SDISEL-NEXT: ccmp w0, #13, #0, ge
-; SDISEL-NEXT: cset w8, gt
-; SDISEL-NEXT: cmp w0, #22
-; SDISEL-NEXT: mov w9, #44 ; =0x2c
-; SDISEL-NEXT: ccmp w0, w9, #0, ge
-; SDISEL-NEXT: csel w8, wzr, w8, le
-; SDISEL-NEXT: cmp w0, #99
-; SDISEL-NEXT: mov w9, #77 ; =0x4d
-; SDISEL-NEXT: ccmp w0, w9, #4, ne
-; SDISEL-NEXT: cset w9, eq
-; SDISEL-NEXT: tst w8, w9
-; SDISEL-NEXT: csel w0, w1, w2, ne
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: select_noccmp3:
+; CHECK-SD: ; %bb.0:
+; CHECK-SD-NEXT: cmp w0, #0
+; CHECK-SD-NEXT: ccmp w0, #13, #0, ge
+; CHECK-SD-NEXT: cset w8, gt
+; CHECK-SD-NEXT: cmp w0, #22
+; CHECK-SD-NEXT: mov w9, #44 ; =0x2c
+; CHECK-SD-NEXT: ccmp w0, w9, #0, ge
+; CHECK-SD-NEXT: csel w8, wzr, w8, le
+; CHECK-SD-NEXT: cmp w0, #99
+; CHECK-SD-NEXT: mov w9, #77 ; =0x4d
+; CHECK-SD-NEXT: ccmp w0, w9, #4, ne
+; CHECK-SD-NEXT: cset w9, eq
+; CHECK-SD-NEXT: tst w8, w9
+; CHECK-SD-NEXT: csel w0, w1, w2, ne
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: select_noccmp3:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: mov w8, #99 ; =0x63
-; GISEL-NEXT: sub w9, w0, #45
-; GISEL-NEXT: cmp w0, #77
-; GISEL-NEXT: ccmp w0, w8, #4, ne
-; GISEL-NEXT: ccmn w9, #23, #2, eq
-; GISEL-NEXT: ccmp w0, #14, #0, lo
-; GISEL-NEXT: csel w0, w1, w2, hs
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: select_noccmp3:
+; CHECK-GI: ; %bb.0:
+; CHECK-GI-NEXT: mov w8, #99 ; =0x63
+; CHECK-GI-NEXT: sub w9, w0, #45
+; CHECK-GI-NEXT: cmp w0, #77
+; CHECK-GI-NEXT: ccmp w0, w8, #4, ne
+; CHECK-GI-NEXT: ccmn w9, #23, #2, eq
+; CHECK-GI-NEXT: ccmp w0, #14, #0, lo
+; CHECK-GI-NEXT: csel w0, w1, w2, hs
+; CHECK-GI-NEXT: ret
%c0 = icmp slt i32 %v0, 0
%c1 = icmp sgt i32 %v0, 13
%c2 = icmp slt i32 %v0, 22
@@ -864,27 +864,27 @@ define i32 @select_or_olt_ueq_ogt(double %v0, double %v1, double %v2, double %v3
; Verify that we correctly promote f16.
define i32 @half_select_and_olt_oge(half %v0, half %v1, half %v2, half %v3, i32 %a, i32 %b) #0 {
-; SDISEL-LABEL: half_select_and_olt_oge:
-; SDISEL: ; %bb.0:
-; SDISEL-NEXT: fcvt s1, h1
-; SDISEL-NEXT: fcvt s0, h0
-; SDISEL-NEXT: fcmp s0, s1
-; SDISEL-NEXT: fcvt s0, h3
-; SDISEL-NEXT: fcvt s1, h2
-; SDISEL-NEXT: fccmp s1, s0, #8, mi
-; SDISEL-NEXT: csel w0, w0, w1, ge
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: half_select_and_olt_oge:
+; CHECK-SD: ; %bb.0:
+; CHECK-SD-NEXT: fcvt s1, h1
+; CHECK-SD-NEXT: fcvt s0, h0
+; CHECK-SD-NEXT: fcmp s0, s1
+; CHECK-SD-NEXT: fcvt s0, h3
+; CHECK-SD-NEXT: fcvt s1, h2
+; CHECK-SD-NEXT: fccmp s1, s0, #8, mi
+; CHECK-SD-NEXT: csel w0, w0, w1, ge
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: half_select_and_olt_oge:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: fcvt s0, h0
-; GISEL-NEXT: fcvt s1, h1
-; GISEL-NEXT: fcvt s2, h2
-; GISEL-NEXT: fcvt s3, h3
-; GISEL-NEXT: fcmp s0, s1
-; GISEL-NEXT: fccmp s2, s3, #8, mi
-; GISEL-NEXT: csel w0, w0, w1, ge
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: half_select_and_olt_oge:
+; CHECK-GI: ; %bb.0:
+; CHECK-GI-NEXT: fcvt s0, h0
+; CHECK-GI-NEXT: fcvt s1, h1
+; CHECK-GI-NEXT: fcvt s2, h2
+; CHECK-GI-NEXT: fcvt s3, h3
+; CHECK-GI-NEXT: fcmp s0, s1
+; CHECK-GI-NEXT: fccmp s2, s3, #8, mi
+; CHECK-GI-NEXT: csel w0, w0, w1, ge
+; CHECK-GI-NEXT: ret
%c0 = fcmp olt half %v0, %v1
%c1 = fcmp oge half %v2, %v3
%cr = and i1 %c1, %c0
@@ -893,29 +893,29 @@ define i32 @half_select_and_olt_oge(half %v0, half %v1, half %v2, half %v3, i32
}
define i32 @half_select_and_olt_one(half %v0, half %v1, half %v2, half %v3, i32 %a, i32 %b) #0 {
-; SDISEL-LABEL: half_select_and_olt_one:
-; SDISEL: ; %bb.0:
-; SDISEL-NEXT: fcvt s1, h1
-; SDISEL-NEXT: fcvt s0, h0
-; SDISEL-NEXT: fcmp s0, s1
-; SDISEL-NEXT: fcvt s0, h3
-; SDISEL-NEXT: fcvt s1, h2
-; SDISEL-NEXT: fccmp s1, s0, #4, mi
-; SDISEL-NEXT: fccmp s1, s0, #1, ne
-; SDISEL-NEXT: csel w0, w0, w1, vc
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: half_select_and_olt_one:
+; CHECK-SD: ; %bb.0:
+; CHECK-SD-NEXT: fcvt s1, h1
+; CHECK-SD-NEXT: fcvt s0, h0
+; CHECK-SD-NEXT: fcmp s0, s1
+; CHECK-SD-NEXT: fcvt s0, h3
+; CHECK-SD-NEXT: fcvt s1, h2
+; CHECK-SD-NEXT: fccmp s1, s0, #4, mi
+; CHECK-SD-NEXT: fccmp s1, s0, #1, ne
+; CHECK-SD-NEXT: csel w0, w0, w1, vc
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: half_select_and_olt_one:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: fcvt s0, h0
-; GISEL-NEXT: fcvt s1, h1
-; GISEL-NEXT: fcvt s2, h2
-; GISEL-NEXT: fcvt s3, h3
-; GISEL-NEXT: fcmp s0, s1
-; GISEL-NEXT: fccmp s2, s3, #4, mi
-; GISEL-NEXT: fccmp s2, s3, #1, ne
-; GISEL-NEXT: csel w0, w0, w1, vc
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: half_select_and_olt_one:
+; CHECK-GI: ; %bb.0:
+; CHECK-GI-NEXT: fcvt s0, h0
+; CHECK-GI-NEXT: fcvt s1, h1
+; CHECK-GI-NEXT: fcvt s2, h2
+; CHECK-GI-NEXT: fcvt s3, h3
+; CHECK-GI-NEXT: fcmp s0, s1
+; CHECK-GI-NEXT: fccmp s2, s3, #4, mi
+; CHECK-GI-NEXT: fccmp s2, s3, #1, ne
+; CHECK-GI-NEXT: csel w0, w0, w1, vc
+; CHECK-GI-NEXT: ret
%c0 = fcmp olt half %v0, %v1
%c1 = fcmp one half %v2, %v3
%cr = and i1 %c1, %c0
@@ -926,51 +926,51 @@ define i32 @half_select_and_olt_one(half %v0, half %v1, half %v2, half %v3, i32
; Also verify that we don't try to generate f128 FCCMPs and instead use RT calls.
define i32 @f128_select_and_olt_oge(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32 %b) #0 {
-; SDISEL-LABEL: f128_select_and_olt_oge:
-; SDISEL: ; %bb.0:
-; SDISEL-NEXT: sub sp, sp, #80
-; SDISEL-NEXT: stp x22, x21, [sp, #32] ; 16-byte Folded Spill
-; SDISEL-NEXT: stp x20, x19, [sp, #48] ; 16-byte Folded Spill
-; SDISEL-NEXT: stp x29, x30, [sp, #64] ; 16-byte Folded Spill
-; SDISEL-NEXT: mov x19, x1
-; SDISEL-NEXT: mov x20, x0
-; SDISEL-NEXT: stp q2, q3, [sp] ; 32-byte Folded Spill
-; SDISEL-NEXT: bl ___lttf2
-; SDISEL-NEXT: cmp w0, #0
-; SDISEL-NEXT: cset w21, lt
-; SDISEL-NEXT: ldp q0, q1, [sp] ; 32-byte Folded Reload
-; SDISEL-NEXT: bl ___getf2
-; SDISEL-NEXT: cmp w0, #0
-; SDISEL-NEXT: cset w8, ge
-; SDISEL-NEXT: tst w8, w21
-; SDISEL-NEXT: csel w0, w20, w19, ne
-; SDISEL-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
-; SDISEL-NEXT: ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
-; SDISEL-NEXT: ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
-; SDISEL-NEXT: add sp, sp, #80
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: f128_select_and_olt_oge:
+; CHECK-SD: ; %bb.0:
+; CHECK-SD-NEXT: sub sp, sp, #80
+; CHECK-SD-NEXT: stp x22, x21, [sp, #32] ; 16-byte Folded Spill
+; CHECK-SD-NEXT: stp x20, x19, [sp, #48] ; 16-byte Folded Spill
+; CHECK-SD-NEXT: stp x29, x30, [sp, #64] ; 16-byte Folded Spill
+; CHECK-SD-NEXT: mov x19, x1
+; CHECK-SD-NEXT: mov x20, x0
+; CHECK-SD-NEXT: stp q2, q3, [sp] ; 32-byte Folded Spill
+; CHECK-SD-NEXT: bl ___lttf2
+; CHECK-SD-NEXT: cmp w0, #0
+; CHECK-SD-NEXT: cset w21, lt
+; CHECK-SD-NEXT: ldp q0, q1, [sp] ; 32-byte Folded Reload
+; CHECK-SD-NEXT: bl ___getf2
+; CHECK-SD-NEXT: cmp w0, #0
+; CHECK-SD-NEXT: cset w8, ge
+; CHECK-SD-NEXT: tst w8, w21
+; CHECK-SD-NEXT: csel w0, w20, w19, ne
+; CHECK-SD-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
+; CHECK-SD-NEXT: ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
+; CHECK-SD-NEXT: ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
+; CHECK-SD-NEXT: add sp, sp, #80
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: f128_select_and_olt_oge:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: sub sp, sp, #80
-; GISEL-NEXT: stp x22, x21, [sp, #32] ; 16-byte Folded Spill
-; GISEL-NEXT: stp x20, x19, [sp, #48] ; 16-byte Folded Spill
-; GISEL-NEXT: stp x29, x30, [sp, #64] ; 16-byte Folded Spill
-; GISEL-NEXT: stp q3, q2, [sp] ; 32-byte Folded Spill
-; GISEL-NEXT: mov x19, x0
-; GISEL-NEXT: mov x20, x1
-; GISEL-NEXT: bl ___lttf2
-; GISEL-NEXT: mov x21, x0
-; GISEL-NEXT: ldp q1, q0, [sp] ; 32-byte Folded Reload
-; GISEL-NEXT: bl ___getf2
-; GISEL-NEXT: cmp w21, #0
-; GISEL-NEXT: ccmp w0, #0, #8, lt
-; GISEL-NEXT: csel w0, w19, w20, ge
-; GISEL-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
-; GISEL-NEXT: ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
-; GISEL-NEXT: ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
-; GISEL-NEXT: add sp, sp, #80
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: f128_select_and_olt_oge:
+; CHECK-GI: ; %bb.0:
+; CHECK-GI-NEXT: sub sp, sp, #80
+; CHECK-GI-NEXT: stp x22, x21, [sp, #32] ; 16-byte Folded Spill
+; CHECK-GI-NEXT: stp x20, x19, [sp, #48] ; 16-byte Folded Spill
+; CHECK-GI-NEXT: stp x29, x30, [sp, #64] ; 16-byte Folded Spill
+; CHECK-GI-NEXT: stp q3, q2, [sp] ; 32-byte Folded Spill
+; CHECK-GI-NEXT: mov x19, x0
+; CHECK-GI-NEXT: mov x20, x1
+; CHECK-GI-NEXT: bl ___lttf2
+; CHECK-GI-NEXT: mov x21, x0
+; CHECK-GI-NEXT: ldp q1, q0, [sp] ; 32-byte Folded Reload
+; CHECK-GI-NEXT: bl ___getf2
+; CHECK-GI-NEXT: cmp w21, #0
+; CHECK-GI-NEXT: ccmp w0, #0, #8, lt
+; CHECK-GI-NEXT: csel w0, w19, w20, ge
+; CHECK-GI-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
+; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
+; CHECK-GI-NEXT: ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
+; CHECK-GI-NEXT: add sp, sp, #80
+; CHECK-GI-NEXT: ret
%c0 = fcmp olt fp128 %v0, %v1
%c1 = fcmp oge fp128 %v2, %v3
%cr = and i1 %c1, %c0
@@ -1048,46 +1048,46 @@ define i32 @deep_or2(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %x, i32 %y) {
; This test checks that multiple ccmp's aren't created in a way that would
; give them multiple uses. It doesn't appear that they are.
define i32 @multiccmp(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %x, i32 %y) #0 {
-; SDISEL-LABEL: multiccmp:
-; SDISEL: ; %bb.0: ; %entry
-; SDISEL-NEXT: stp x22, x21, [sp, #-48]! ; 16-byte Folded Spill
-; SDISEL-NEXT: stp x20, x19, [sp, #16] ; 16-byte Folded Spill
-; SDISEL-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill
-; SDISEL-NEXT: mov x19, x5
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: cset w20, gt
-; SDISEL-NEXT: cmp w2, w3
-; SDISEL-NEXT: cset w21, ne
-; SDISEL-NEXT: tst w20, w21
-; SDISEL-NEXT: csel w0, w5, w4, ne
-; SDISEL-NEXT: bl _callee
-; SDISEL-NEXT: tst w20, w21
-; SDISEL-NEXT: csel w0, w0, w19, ne
-; SDISEL-NEXT: bl _callee
-; SDISEL-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
-; SDISEL-NEXT: ldp x20, x19, [sp, #16] ; 16-byte Folded Reload
-; SDISEL-NEXT: ldp x22, x21, [sp], #48 ; 16-byte Folded Reload
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: multiccmp:
+; CHECK-SD: ; %bb.0: ; %entry
+; CHECK-SD-NEXT: stp x22, x21, [sp, #-48]! ; 16-byte Folded Spill
+; CHECK-SD-NEXT: stp x20, x19, [sp, #16] ; 16-byte Folded Spill
+; CHECK-SD-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; CHECK-SD-NEXT: mov x19, x5
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: cset w20, gt
+; CHECK-SD-NEXT: cmp w2, w3
+; CHECK-SD-NEXT: cset w21, ne
+; CHECK-SD-NEXT: tst w20, w21
+; CHECK-SD-NEXT: csel w0, w5, w4, ne
+; CHECK-SD-NEXT: bl _callee
+; CHECK-SD-NEXT: tst w20, w21
+; CHECK-SD-NEXT: csel w0, w0, w19, ne
+; CHECK-SD-NEXT: bl _callee
+; CHECK-SD-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; CHECK-SD-NEXT: ldp x20, x19, [sp, #16] ; 16-byte Folded Reload
+; CHECK-SD-NEXT: ldp x22, x21, [sp], #48 ; 16-byte Folded Reload
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: multiccmp:
-; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
-; GISEL-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
-; GISEL-NEXT: mov x19, x5
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, gt
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ne
-; GISEL-NEXT: and w20, w8, w9
-; GISEL-NEXT: tst w20, #0x1
-; GISEL-NEXT: csel w0, w5, w4, ne
-; GISEL-NEXT: bl _callee
-; GISEL-NEXT: tst w20, #0x1
-; GISEL-NEXT: csel w0, w0, w19, ne
-; GISEL-NEXT: bl _callee
-; GISEL-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
-; GISEL-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: multiccmp:
+; CHECK-GI: ; %bb.0: ; %entry
+; CHECK-GI-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; CHECK-GI-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-GI-NEXT: mov x19, x5
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, gt
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ne
+; CHECK-GI-NEXT: and w20, w8, w9
+; CHECK-GI-NEXT: tst w20, #0x1
+; CHECK-GI-NEXT: csel w0, w5, w4, ne
+; CHECK-GI-NEXT: bl _callee
+; CHECK-GI-NEXT: tst w20, #0x1
+; CHECK-GI-NEXT: csel w0, w0, w19, ne
+; CHECK-GI-NEXT: bl _callee
+; CHECK-GI-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-GI-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sgt i32 %s0, %s1
%c1 = icmp ne i32 %s2, %s3
@@ -1100,57 +1100,57 @@ entry:
}
define i32 @multiccmp2(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %x, i32 %y) #0 {
-; SDISEL-LABEL: multiccmp2:
-; SDISEL: ; %bb.0: ; %entry
-; SDISEL-NEXT: stp x22, x21, [sp, #-48]! ; 16-byte Folded Spill
-; SDISEL-NEXT: stp x20, x19, [sp, #16] ; 16-byte Folded Spill
-; SDISEL-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill
-; SDISEL-NEXT: mov x19, x5
-; SDISEL-NEXT: mov x20, x3
-; SDISEL-NEXT: mov x21, x0
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: cset w8, gt
-; SDISEL-NEXT: cmp w2, w3
-; SDISEL-NEXT: cset w22, ne
-; SDISEL-NEXT: tst w8, w22
-; SDISEL-NEXT: csel w0, w5, w4, ne
-; SDISEL-NEXT: bl _callee
-; SDISEL-NEXT: cmp w21, w20
-; SDISEL-NEXT: cset w8, eq
-; SDISEL-NEXT: tst w22, w8
-; SDISEL-NEXT: csel w0, w0, w19, ne
-; SDISEL-NEXT: bl _callee
-; SDISEL-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
-; SDISEL-NEXT: ldp x20, x19, [sp, #16] ; 16-byte Folded Reload
-; SDISEL-NEXT: ldp x22, x21, [sp], #48 ; 16-byte Folded Reload
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: multiccmp2:
+; CHECK-SD: ; %bb.0: ; %entry
+; CHECK-SD-NEXT: stp x22, x21, [sp, #-48]! ; 16-byte Folded Spill
+; CHECK-SD-NEXT: stp x20, x19, [sp, #16] ; 16-byte Folded Spill
+; CHECK-SD-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; CHECK-SD-NEXT: mov x19, x5
+; CHECK-SD-NEXT: mov x20, x3
+; CHECK-SD-NEXT: mov x21, x0
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: cset w8, gt
+; CHECK-SD-NEXT: cmp w2, w3
+; CHECK-SD-NEXT: cset w22, ne
+; CHECK-SD-NEXT: tst w8, w22
+; CHECK-SD-NEXT: csel w0, w5, w4, ne
+; CHECK-SD-NEXT: bl _callee
+; CHECK-SD-NEXT: cmp w21, w20
+; CHECK-SD-NEXT: cset w8, eq
+; CHECK-SD-NEXT: tst w22, w8
+; CHECK-SD-NEXT: csel w0, w0, w19, ne
+; CHECK-SD-NEXT: bl _callee
+; CHECK-SD-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; CHECK-SD-NEXT: ldp x20, x19, [sp, #16] ; 16-byte Folded Reload
+; CHECK-SD-NEXT: ldp x22, x21, [sp], #48 ; 16-byte Folded Reload
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: multiccmp2:
-; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: stp x22, x21, [sp, #-48]! ; 16-byte Folded Spill
-; GISEL-NEXT: stp x20, x19, [sp, #16] ; 16-byte Folded Spill
-; GISEL-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill
-; GISEL-NEXT: mov x19, x0
-; GISEL-NEXT: mov x20, x3
-; GISEL-NEXT: mov x21, x5
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, gt
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w22, ne
-; GISEL-NEXT: and w8, w8, w22
-; GISEL-NEXT: tst w8, #0x1
-; GISEL-NEXT: csel w0, w5, w4, ne
-; GISEL-NEXT: bl _callee
-; GISEL-NEXT: cmp w19, w20
-; GISEL-NEXT: cset w8, eq
-; GISEL-NEXT: and w8, w22, w8
-; GISEL-NEXT: tst w8, #0x1
-; GISEL-NEXT: csel w0, w0, w21, ne
-; GISEL-NEXT: bl _callee
-; GISEL-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
-; GISEL-NEXT: ldp x20, x19, [sp, #16] ; 16-byte Folded Reload
-; GISEL-NEXT: ldp x22, x21, [sp], #48 ; 16-byte Folded Reload
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: multiccmp2:
+; CHECK-GI: ; %bb.0: ; %entry
+; CHECK-GI-NEXT: stp x22, x21, [sp, #-48]! ; 16-byte Folded Spill
+; CHECK-GI-NEXT: stp x20, x19, [sp, #16] ; 16-byte Folded Spill
+; CHECK-GI-NEXT: stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; CHECK-GI-NEXT: mov x19, x0
+; CHECK-GI-NEXT: mov x20, x3
+; CHECK-GI-NEXT: mov x21, x5
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, gt
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w22, ne
+; CHECK-GI-NEXT: and w8, w8, w22
+; CHECK-GI-NEXT: tst w8, #0x1
+; CHECK-GI-NEXT: csel w0, w5, w4, ne
+; CHECK-GI-NEXT: bl _callee
+; CHECK-GI-NEXT: cmp w19, w20
+; CHECK-GI-NEXT: cset w8, eq
+; CHECK-GI-NEXT: and w8, w22, w8
+; CHECK-GI-NEXT: tst w8, #0x1
+; CHECK-GI-NEXT: csel w0, w0, w21, ne
+; CHECK-GI-NEXT: bl _callee
+; CHECK-GI-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; CHECK-GI-NEXT: ldp x20, x19, [sp, #16] ; 16-byte Folded Reload
+; CHECK-GI-NEXT: ldp x22, x21, [sp], #48 ; 16-byte Folded Reload
+; CHECK-GI-NEXT: ret
entry:
%c0 = icmp sgt i32 %s0, %s1
%c1 = icmp ne i32 %s2, %s3
@@ -1168,21 +1168,21 @@ entry:
declare i32 @callee(i32)
define i1 @cmp_and_negative_const(i32 %0, i32 %1) {
-; SDISEL-LABEL: cmp_and_negative_const:
-; SDISEL: ; %bb.0:
-; SDISEL-NEXT: cmn w0, #1
-; SDISEL-NEXT: ccmn w1, #2, #0, eq
-; SDISEL-NEXT: cset w0, eq
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: cmp_and_negative_const:
+; CHECK-SD: ; %bb.0:
+; CHECK-SD-NEXT: cmn w0, #1
+; CHECK-SD-NEXT: ccmn w1, #2, #0, eq
+; CHECK-SD-NEXT: cset w0, eq
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: cmp_and_negative_const:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: cmn w0, #1
-; GISEL-NEXT: cset w8, eq
-; GISEL-NEXT: cmn w1, #2
-; GISEL-NEXT: cset w9, eq
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: cmp_and_negative_const:
+; CHECK-GI: ; %bb.0:
+; CHECK-GI-NEXT: cmn w0, #1
+; CHECK-GI-NEXT: cset w8, eq
+; CHECK-GI-NEXT: cmn w1, #2
+; CHECK-GI-NEXT: cset w9, eq
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
%3 = icmp eq i32 %0, -1
%4 = icmp eq i32 %1, -2
%5 = and i1 %3, %4
@@ -1190,21 +1190,21 @@ define i1 @cmp_and_negative_const(i32 %0, i32 %1) {
}
define i1 @cmp_or_negative_const(i32 %a, i32 %b) {
-; SDISEL-LABEL: cmp_or_negative_const:
-; SDISEL: ; %bb.0:
-; SDISEL-NEXT: cmn w0, #1
-; SDISEL-NEXT: ccmn w1, #2, #4, ne
-; SDISEL-NEXT: cset w0, eq
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: cmp_or_negative_const:
+; CHECK-SD: ; %bb.0:
+; CHECK-SD-NEXT: cmn w0, #1
+; CHECK-SD-NEXT: ccmn w1, #2, #4, ne
+; CHECK-SD-NEXT: cset w0, eq
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: cmp_or_negative_const:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: cmn w0, #1
-; GISEL-NEXT: cset w8, eq
-; GISEL-NEXT: cmn w1, #2
-; GISEL-NEXT: cset w9, eq
-; GISEL-NEXT: orr w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: cmp_or_negative_const:
+; CHECK-GI: ; %bb.0:
+; CHECK-GI-NEXT: cmn w0, #1
+; CHECK-GI-NEXT: cset w8, eq
+; CHECK-GI-NEXT: cmn w1, #2
+; CHECK-GI-NEXT: cset w9, eq
+; CHECK-GI-NEXT: orr w0, w8, w9
+; CHECK-GI-NEXT: ret
%cmp = icmp eq i32 %a, -1
%cmp1 = icmp eq i32 %b, -2
%or.cond = or i1 %cmp, %cmp1
diff --git a/llvm/test/CodeGen/AArch64/arm64-fml-combines.ll b/llvm/test/CodeGen/AArch64/arm64-fml-combines.ll
index ce35810..60c48bf 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fml-combines.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fml-combines.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O3 -mtriple=arm64-apple-ios -enable-unsafe-fp-math -mattr=+fullfp16 | FileCheck %s
+; RUN: llc < %s -O3 -mtriple=arm64-apple-ios -mattr=+fullfp16 | FileCheck %s
; RUN: llc < %s -O3 -mtriple=arm64-apple-ios -fp-contract=fast -mattr=+fullfp16 | FileCheck %s
define void @foo_2d(ptr %src) {
@@ -130,9 +130,9 @@ for.end: ; preds = %for.body
; CHECK: fnmadd h0, h0, h1, h2
define half @test0(half %a, half %b, half %c) {
entry:
- %0 = fmul half %a, %b
- %mul = fsub half -0.000000e+00, %0
- %sub1 = fsub half %mul, %c
+ %0 = fmul contract half %a, %b
+ %mul = fsub contract half -0.000000e+00, %0
+ %sub1 = fsub contract half %mul, %c
ret half %sub1
}
@@ -140,9 +140,9 @@ entry:
; CHECK: fnmadd s0, s0, s1, s2
define float @test1(float %a, float %b, float %c) {
entry:
- %0 = fmul float %a, %b
- %mul = fsub float -0.000000e+00, %0
- %sub1 = fsub float %mul, %c
+ %0 = fmul contract float %a, %b
+ %mul = fsub contract float -0.000000e+00, %0
+ %sub1 = fsub contract float %mul, %c
ret float %sub1
}
@@ -150,9 +150,9 @@ entry:
; CHECK: fnmadd d0, d0, d1, d2
define double @test2(double %a, double %b, double %c) {
entry:
- %0 = fmul double %a, %b
- %mul = fsub double -0.000000e+00, %0
- %sub1 = fsub double %mul, %c
+ %0 = fmul contract double %a, %b
+ %mul = fsub contract double -0.000000e+00, %0
+ %sub1 = fsub contract double %mul, %c
ret double %sub1
}
diff --git a/llvm/test/CodeGen/AArch64/cmp-chains.ll b/llvm/test/CodeGen/AArch64/cmp-chains.ll
index 4b816df..3620444 100644
--- a/llvm/test/CodeGen/AArch64/cmp-chains.ll
+++ b/llvm/test/CodeGen/AArch64/cmp-chains.ll
@@ -1,26 +1,26 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK,SDISEL
-; RUN: llc < %s -mtriple=aarch64-- -global-isel | FileCheck %s --check-prefixes=CHECK,GISEL
+; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -mtriple=aarch64-- -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
; Ensure chains of comparisons produce chains of `ccmp`
; (x0 < x1) && (x2 > x3)
define i32 @cmp_and2(i32 %0, i32 %1, i32 %2, i32 %3) {
-; SDISEL-LABEL: cmp_and2:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, lo
-; SDISEL-NEXT: cset w0, hi
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: cmp_and2:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, lo
+; CHECK-SD-NEXT: cset w0, hi
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: cmp_and2:
-; GISEL: // %bb.0:
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lo
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hi
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: cmp_and2:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hi
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
%5 = icmp ult i32 %0, %1
%6 = icmp ugt i32 %2, %3
%7 = select i1 %5, i1 %6, i1 false
@@ -30,25 +30,25 @@ define i32 @cmp_and2(i32 %0, i32 %1, i32 %2, i32 %3) {
; (x0 < x1) && (x2 > x3) && (x4 != x5)
define i32 @cmp_and3(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5) {
-; SDISEL-LABEL: cmp_and3:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, lo
-; SDISEL-NEXT: ccmp w4, w5, #4, hi
-; SDISEL-NEXT: cset w0, ne
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: cmp_and3:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, lo
+; CHECK-SD-NEXT: ccmp w4, w5, #4, hi
+; CHECK-SD-NEXT: cset w0, ne
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: cmp_and3:
-; GISEL: // %bb.0:
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lo
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hi
-; GISEL-NEXT: cmp w4, w5
-; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: cset w9, ne
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: cmp_and3:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hi
+; CHECK-GI-NEXT: cmp w4, w5
+; CHECK-GI-NEXT: and w8, w8, w9
+; CHECK-GI-NEXT: cset w9, ne
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
%7 = icmp ult i32 %0, %1
%8 = icmp ugt i32 %2, %3
%9 = select i1 %7, i1 %8, i1 false
@@ -60,29 +60,29 @@ define i32 @cmp_and3(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5) {
; (x0 < x1) && (x2 > x3) && (x4 != x5) && (x6 == x7)
define i32 @cmp_and4(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7) {
-; SDISEL-LABEL: cmp_and4:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: cmp w2, w3
-; SDISEL-NEXT: ccmp w0, w1, #2, hi
-; SDISEL-NEXT: ccmp w4, w5, #4, lo
-; SDISEL-NEXT: ccmp w6, w7, #0, ne
-; SDISEL-NEXT: cset w0, eq
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: cmp_and4:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: cmp w2, w3
+; CHECK-SD-NEXT: ccmp w0, w1, #2, hi
+; CHECK-SD-NEXT: ccmp w4, w5, #4, lo
+; CHECK-SD-NEXT: ccmp w6, w7, #0, ne
+; CHECK-SD-NEXT: cset w0, eq
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: cmp_and4:
-; GISEL: // %bb.0:
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w8, hi
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w9, lo
-; GISEL-NEXT: cmp w4, w5
-; GISEL-NEXT: cset w10, ne
-; GISEL-NEXT: cmp w6, w7
-; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: cset w11, eq
-; GISEL-NEXT: and w9, w10, w11
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: cmp_and4:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w8, hi
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w9, lo
+; CHECK-GI-NEXT: cmp w4, w5
+; CHECK-GI-NEXT: cset w10, ne
+; CHECK-GI-NEXT: cmp w6, w7
+; CHECK-GI-NEXT: and w8, w8, w9
+; CHECK-GI-NEXT: cset w11, eq
+; CHECK-GI-NEXT: and w9, w10, w11
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
%9 = icmp ugt i32 %2, %3
%10 = icmp ult i32 %0, %1
%11 = select i1 %9, i1 %10, i1 false
@@ -96,22 +96,22 @@ define i32 @cmp_and4(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32
; (x0 < x1) || (x2 > x3)
define i32 @cmp_or2(i32 %0, i32 %1, i32 %2, i32 %3) {
-; SDISEL-LABEL: cmp_or2:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #0, hs
-; SDISEL-NEXT: cset w0, ne
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: cmp_or2:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #0, hs
+; CHECK-SD-NEXT: cset w0, ne
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: cmp_or2:
-; GISEL: // %bb.0:
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lo
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, ne
-; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: and w0, w8, #0x1
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: cmp_or2:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, ne
+; CHECK-GI-NEXT: orr w8, w8, w9
+; CHECK-GI-NEXT: and w0, w8, #0x1
+; CHECK-GI-NEXT: ret
%5 = icmp ult i32 %0, %1
%6 = icmp ne i32 %2, %3
%7 = select i1 %5, i1 true, i1 %6
@@ -121,26 +121,26 @@ define i32 @cmp_or2(i32 %0, i32 %1, i32 %2, i32 %3) {
; (x0 < x1) || (x2 > x3) || (x4 != x5)
define i32 @cmp_or3(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5) {
-; SDISEL-LABEL: cmp_or3:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #2, hs
-; SDISEL-NEXT: ccmp w4, w5, #0, ls
-; SDISEL-NEXT: cset w0, ne
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: cmp_or3:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #2, hs
+; CHECK-SD-NEXT: ccmp w4, w5, #0, ls
+; CHECK-SD-NEXT: cset w0, ne
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: cmp_or3:
-; GISEL: // %bb.0:
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lo
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hi
-; GISEL-NEXT: cmp w4, w5
-; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: cset w9, ne
-; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: and w0, w8, #0x1
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: cmp_or3:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hi
+; CHECK-GI-NEXT: cmp w4, w5
+; CHECK-GI-NEXT: orr w8, w8, w9
+; CHECK-GI-NEXT: cset w9, ne
+; CHECK-GI-NEXT: orr w8, w8, w9
+; CHECK-GI-NEXT: and w0, w8, #0x1
+; CHECK-GI-NEXT: ret
%7 = icmp ult i32 %0, %1
%8 = icmp ugt i32 %2, %3
%9 = select i1 %7, i1 true, i1 %8
@@ -152,30 +152,30 @@ define i32 @cmp_or3(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5) {
; (x0 < x1) || (x2 > x3) || (x4 != x5) || (x6 == x7)
define i32 @cmp_or4(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7) {
-; SDISEL-LABEL: cmp_or4:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: cmp w0, w1
-; SDISEL-NEXT: ccmp w2, w3, #2, hs
-; SDISEL-NEXT: ccmp w4, w5, #0, ls
-; SDISEL-NEXT: ccmp w6, w7, #4, eq
-; SDISEL-NEXT: cset w0, eq
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: cmp_or4:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: cmp w0, w1
+; CHECK-SD-NEXT: ccmp w2, w3, #2, hs
+; CHECK-SD-NEXT: ccmp w4, w5, #0, ls
+; CHECK-SD-NEXT: ccmp w6, w7, #4, eq
+; CHECK-SD-NEXT: cset w0, eq
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: cmp_or4:
-; GISEL: // %bb.0:
-; GISEL-NEXT: cmp w0, w1
-; GISEL-NEXT: cset w8, lo
-; GISEL-NEXT: cmp w2, w3
-; GISEL-NEXT: cset w9, hi
-; GISEL-NEXT: cmp w4, w5
-; GISEL-NEXT: cset w10, ne
-; GISEL-NEXT: cmp w6, w7
-; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: cset w11, eq
-; GISEL-NEXT: orr w9, w10, w11
-; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: and w0, w8, #0x1
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: cmp_or4:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: cmp w0, w1
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: cmp w2, w3
+; CHECK-GI-NEXT: cset w9, hi
+; CHECK-GI-NEXT: cmp w4, w5
+; CHECK-GI-NEXT: cset w10, ne
+; CHECK-GI-NEXT: cmp w6, w7
+; CHECK-GI-NEXT: orr w8, w8, w9
+; CHECK-GI-NEXT: cset w11, eq
+; CHECK-GI-NEXT: orr w9, w10, w11
+; CHECK-GI-NEXT: orr w8, w8, w9
+; CHECK-GI-NEXT: and w0, w8, #0x1
+; CHECK-GI-NEXT: ret
%9 = icmp ult i32 %0, %1
%10 = icmp ugt i32 %2, %3
%11 = select i1 %9, i1 true, i1 %10
@@ -189,22 +189,22 @@ define i32 @cmp_or4(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32
; (x0 != 0) || (x1 != 0)
define i32 @true_or2(i32 %0, i32 %1) {
-; SDISEL-LABEL: true_or2:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: orr w8, w0, w1
-; SDISEL-NEXT: cmp w8, #0
-; SDISEL-NEXT: cset w0, ne
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: true_or2:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: orr w8, w0, w1
+; CHECK-SD-NEXT: cmp w8, #0
+; CHECK-SD-NEXT: cset w0, ne
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: true_or2:
-; GISEL: // %bb.0:
-; GISEL-NEXT: cmp w0, #0
-; GISEL-NEXT: cset w8, ne
-; GISEL-NEXT: cmp w1, #0
-; GISEL-NEXT: cset w9, ne
-; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: and w0, w8, #0x1
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: true_or2:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: cmp w0, #0
+; CHECK-GI-NEXT: cset w8, ne
+; CHECK-GI-NEXT: cmp w1, #0
+; CHECK-GI-NEXT: cset w9, ne
+; CHECK-GI-NEXT: orr w8, w8, w9
+; CHECK-GI-NEXT: and w0, w8, #0x1
+; CHECK-GI-NEXT: ret
%3 = icmp ne i32 %0, 0
%4 = icmp ne i32 %1, 0
%5 = select i1 %3, i1 true, i1 %4
@@ -214,26 +214,26 @@ define i32 @true_or2(i32 %0, i32 %1) {
; (x0 != 0) || (x1 != 0) || (x2 != 0)
define i32 @true_or3(i32 %0, i32 %1, i32 %2) {
-; SDISEL-LABEL: true_or3:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: orr w8, w0, w1
-; SDISEL-NEXT: orr w8, w8, w2
-; SDISEL-NEXT: cmp w8, #0
-; SDISEL-NEXT: cset w0, ne
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: true_or3:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: orr w8, w0, w1
+; CHECK-SD-NEXT: orr w8, w8, w2
+; CHECK-SD-NEXT: cmp w8, #0
+; CHECK-SD-NEXT: cset w0, ne
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: true_or3:
-; GISEL: // %bb.0:
-; GISEL-NEXT: cmp w0, #0
-; GISEL-NEXT: cset w8, ne
-; GISEL-NEXT: cmp w1, #0
-; GISEL-NEXT: cset w9, ne
-; GISEL-NEXT: cmp w2, #0
-; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: cset w9, ne
-; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: and w0, w8, #0x1
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: true_or3:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: cmp w0, #0
+; CHECK-GI-NEXT: cset w8, ne
+; CHECK-GI-NEXT: cmp w1, #0
+; CHECK-GI-NEXT: cset w9, ne
+; CHECK-GI-NEXT: cmp w2, #0
+; CHECK-GI-NEXT: orr w8, w8, w9
+; CHECK-GI-NEXT: cset w9, ne
+; CHECK-GI-NEXT: orr w8, w8, w9
+; CHECK-GI-NEXT: and w0, w8, #0x1
+; CHECK-GI-NEXT: ret
%4 = icmp ne i32 %0, 0
%5 = icmp ne i32 %1, 0
%6 = select i1 %4, i1 true, i1 %5
@@ -260,22 +260,22 @@ define i32 @neg_range_int(i32 %a, i32 %b, i32 %c) {
; (b > -(d | 1) && a < c)
define i32 @neg_range_int_comp(i32 %a, i32 %b, i32 %c, i32 %d) {
-; SDISEL-LABEL: neg_range_int_comp:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: orr w8, w3, #0x1
-; SDISEL-NEXT: cmp w0, w2
-; SDISEL-NEXT: ccmn w1, w8, #4, lt
-; SDISEL-NEXT: csel w0, w1, w0, gt
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: neg_range_int_comp:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: orr w8, w3, #0x1
+; CHECK-SD-NEXT: cmp w0, w2
+; CHECK-SD-NEXT: ccmn w1, w8, #4, lt
+; CHECK-SD-NEXT: csel w0, w1, w0, gt
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: neg_range_int_comp:
-; GISEL: // %bb.0:
-; GISEL-NEXT: orr w8, w3, #0x1
-; GISEL-NEXT: cmp w0, w2
-; GISEL-NEXT: neg w8, w8
-; GISEL-NEXT: ccmp w1, w8, #4, lt
-; GISEL-NEXT: csel w0, w1, w0, gt
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: neg_range_int_comp:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: orr w8, w3, #0x1
+; CHECK-GI-NEXT: cmp w0, w2
+; CHECK-GI-NEXT: neg w8, w8
+; CHECK-GI-NEXT: ccmp w1, w8, #4, lt
+; CHECK-GI-NEXT: csel w0, w1, w0, gt
+; CHECK-GI-NEXT: ret
%dor = or i32 %d, 1
%negd = sub i32 0, %dor
%cmp = icmp sgt i32 %b, %negd
@@ -287,22 +287,22 @@ define i32 @neg_range_int_comp(i32 %a, i32 %b, i32 %c, i32 %d) {
; (b >u -(d | 1) && a < c)
define i32 @neg_range_int_comp_u(i32 %a, i32 %b, i32 %c, i32 %d) {
-; SDISEL-LABEL: neg_range_int_comp_u:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: orr w8, w3, #0x1
-; SDISEL-NEXT: cmp w0, w2
-; SDISEL-NEXT: ccmn w1, w8, #0, lt
-; SDISEL-NEXT: csel w0, w1, w0, hi
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: neg_range_int_comp_u:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: orr w8, w3, #0x1
+; CHECK-SD-NEXT: cmp w0, w2
+; CHECK-SD-NEXT: ccmn w1, w8, #0, lt
+; CHECK-SD-NEXT: csel w0, w1, w0, hi
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: neg_range_int_comp_u:
-; GISEL: // %bb.0:
-; GISEL-NEXT: orr w8, w3, #0x1
-; GISEL-NEXT: cmp w0, w2
-; GISEL-NEXT: neg w8, w8
-; GISEL-NEXT: ccmp w1, w8, #0, lt
-; GISEL-NEXT: csel w0, w1, w0, hi
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: neg_range_int_comp_u:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: orr w8, w3, #0x1
+; CHECK-GI-NEXT: cmp w0, w2
+; CHECK-GI-NEXT: neg w8, w8
+; CHECK-GI-NEXT: ccmp w1, w8, #0, lt
+; CHECK-GI-NEXT: csel w0, w1, w0, hi
+; CHECK-GI-NEXT: ret
%dor = or i32 %d, 1
%negd = sub i32 0, %dor
%cmp = icmp ugt i32 %b, %negd
@@ -314,22 +314,22 @@ define i32 @neg_range_int_comp_u(i32 %a, i32 %b, i32 %c, i32 %d) {
; (b > -(d | 1) && a u < c)
define i32 @neg_range_int_comp_ua(i32 %a, i32 %b, i32 %c, i32 %d) {
-; SDISEL-LABEL: neg_range_int_comp_ua:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: orr w8, w3, #0x1
-; SDISEL-NEXT: cmp w0, w2
-; SDISEL-NEXT: ccmn w1, w8, #4, lo
-; SDISEL-NEXT: csel w0, w1, w0, gt
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: neg_range_int_comp_ua:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: orr w8, w3, #0x1
+; CHECK-SD-NEXT: cmp w0, w2
+; CHECK-SD-NEXT: ccmn w1, w8, #4, lo
+; CHECK-SD-NEXT: csel w0, w1, w0, gt
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: neg_range_int_comp_ua:
-; GISEL: // %bb.0:
-; GISEL-NEXT: orr w8, w3, #0x1
-; GISEL-NEXT: cmp w0, w2
-; GISEL-NEXT: neg w8, w8
-; GISEL-NEXT: ccmp w1, w8, #4, lo
-; GISEL-NEXT: csel w0, w1, w0, gt
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: neg_range_int_comp_ua:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: orr w8, w3, #0x1
+; CHECK-GI-NEXT: cmp w0, w2
+; CHECK-GI-NEXT: neg w8, w8
+; CHECK-GI-NEXT: ccmp w1, w8, #4, lo
+; CHECK-GI-NEXT: csel w0, w1, w0, gt
+; CHECK-GI-NEXT: ret
%dor = or i32 %d, 1
%negd = sub i32 0, %dor
%cmp = icmp sgt i32 %b, %negd
@@ -341,19 +341,19 @@ define i32 @neg_range_int_comp_ua(i32 %a, i32 %b, i32 %c, i32 %d) {
; (b <= -3 && a > c)
define i32 @neg_range_int_2(i32 %a, i32 %b, i32 %c) {
-; SDISEL-LABEL: neg_range_int_2:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: cmp w0, w2
-; SDISEL-NEXT: ccmn w1, #4, #4, gt
-; SDISEL-NEXT: csel w0, w1, w0, gt
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: neg_range_int_2:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: cmp w0, w2
+; CHECK-SD-NEXT: ccmn w1, #4, #4, gt
+; CHECK-SD-NEXT: csel w0, w1, w0, gt
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: neg_range_int_2:
-; GISEL: // %bb.0:
-; GISEL-NEXT: cmp w0, w2
-; GISEL-NEXT: ccmn w1, #3, #8, gt
-; GISEL-NEXT: csel w0, w1, w0, ge
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: neg_range_int_2:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: cmp w0, w2
+; CHECK-GI-NEXT: ccmn w1, #3, #8, gt
+; CHECK-GI-NEXT: csel w0, w1, w0, ge
+; CHECK-GI-NEXT: ret
%cmp = icmp sge i32 %b, -3
%cmp1 = icmp sgt i32 %a, %c
%or.cond = and i1 %cmp, %cmp1
@@ -363,22 +363,22 @@ define i32 @neg_range_int_2(i32 %a, i32 %b, i32 %c) {
; (b < -(d | 1) && a >= c)
define i32 @neg_range_int_comp2(i32 %a, i32 %b, i32 %c, i32 %d) {
-; SDISEL-LABEL: neg_range_int_comp2:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: orr w8, w3, #0x1
-; SDISEL-NEXT: cmp w0, w2
-; SDISEL-NEXT: ccmn w1, w8, #0, ge
-; SDISEL-NEXT: csel w0, w1, w0, lt
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: neg_range_int_comp2:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: orr w8, w3, #0x1
+; CHECK-SD-NEXT: cmp w0, w2
+; CHECK-SD-NEXT: ccmn w1, w8, #0, ge
+; CHECK-SD-NEXT: csel w0, w1, w0, lt
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: neg_range_int_comp2:
-; GISEL: // %bb.0:
-; GISEL-NEXT: orr w8, w3, #0x1
-; GISEL-NEXT: cmp w0, w2
-; GISEL-NEXT: neg w8, w8
-; GISEL-NEXT: ccmp w1, w8, #0, ge
-; GISEL-NEXT: csel w0, w1, w0, lt
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: neg_range_int_comp2:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: orr w8, w3, #0x1
+; CHECK-GI-NEXT: cmp w0, w2
+; CHECK-GI-NEXT: neg w8, w8
+; CHECK-GI-NEXT: ccmp w1, w8, #0, ge
+; CHECK-GI-NEXT: csel w0, w1, w0, lt
+; CHECK-GI-NEXT: ret
%dor = or i32 %d, 1
%negd = sub i32 0, %dor
%cmp = icmp slt i32 %b, %negd
@@ -390,22 +390,22 @@ define i32 @neg_range_int_comp2(i32 %a, i32 %b, i32 %c, i32 %d) {
; (b <u -(d | 1) && a > c)
define i32 @neg_range_int_comp_u2(i32 %a, i32 %b, i32 %c, i32 %d) {
-; SDISEL-LABEL: neg_range_int_comp_u2:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: orr w8, w3, #0x1
-; SDISEL-NEXT: cmp w0, w2
-; SDISEL-NEXT: ccmn w1, w8, #2, gt
-; SDISEL-NEXT: csel w0, w1, w0, lo
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: neg_range_int_comp_u2:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: orr w8, w3, #0x1
+; CHECK-SD-NEXT: cmp w0, w2
+; CHECK-SD-NEXT: ccmn w1, w8, #2, gt
+; CHECK-SD-NEXT: csel w0, w1, w0, lo
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: neg_range_int_comp_u2:
-; GISEL: // %bb.0:
-; GISEL-NEXT: orr w8, w3, #0x1
-; GISEL-NEXT: cmp w0, w2
-; GISEL-NEXT: neg w8, w8
-; GISEL-NEXT: ccmp w1, w8, #2, gt
-; GISEL-NEXT: csel w0, w1, w0, lo
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: neg_range_int_comp_u2:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: orr w8, w3, #0x1
+; CHECK-GI-NEXT: cmp w0, w2
+; CHECK-GI-NEXT: neg w8, w8
+; CHECK-GI-NEXT: ccmp w1, w8, #2, gt
+; CHECK-GI-NEXT: csel w0, w1, w0, lo
+; CHECK-GI-NEXT: ret
%dor = or i32 %d, 1
%negd = sub i32 0, %dor
%cmp = icmp ult i32 %b, %negd
@@ -417,22 +417,22 @@ define i32 @neg_range_int_comp_u2(i32 %a, i32 %b, i32 %c, i32 %d) {
; (b > -(d | 1) && a u > c)
define i32 @neg_range_int_comp_ua2(i32 %a, i32 %b, i32 %c, i32 %d) {
-; SDISEL-LABEL: neg_range_int_comp_ua2:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: orr w8, w3, #0x1
-; SDISEL-NEXT: cmp w0, w2
-; SDISEL-NEXT: ccmn w1, w8, #4, hi
-; SDISEL-NEXT: csel w0, w1, w0, gt
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: neg_range_int_comp_ua2:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: orr w8, w3, #0x1
+; CHECK-SD-NEXT: cmp w0, w2
+; CHECK-SD-NEXT: ccmn w1, w8, #4, hi
+; CHECK-SD-NEXT: csel w0, w1, w0, gt
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: neg_range_int_comp_ua2:
-; GISEL: // %bb.0:
-; GISEL-NEXT: orr w8, w3, #0x1
-; GISEL-NEXT: cmp w0, w2
-; GISEL-NEXT: neg w8, w8
-; GISEL-NEXT: ccmp w1, w8, #4, hi
-; GISEL-NEXT: csel w0, w1, w0, gt
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: neg_range_int_comp_ua2:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: orr w8, w3, #0x1
+; CHECK-GI-NEXT: cmp w0, w2
+; CHECK-GI-NEXT: neg w8, w8
+; CHECK-GI-NEXT: ccmp w1, w8, #4, hi
+; CHECK-GI-NEXT: csel w0, w1, w0, gt
+; CHECK-GI-NEXT: ret
%dor = or i32 %d, 1
%negd = sub i32 0, %dor
%cmp = icmp sgt i32 %b, %negd
@@ -444,22 +444,22 @@ define i32 @neg_range_int_comp_ua2(i32 %a, i32 %b, i32 %c, i32 %d) {
; (b > -(d | 1) && a u == c)
define i32 @neg_range_int_comp_ua3(i32 %a, i32 %b, i32 %c, i32 %d) {
-; SDISEL-LABEL: neg_range_int_comp_ua3:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: orr w8, w3, #0x1
-; SDISEL-NEXT: cmp w0, w2
-; SDISEL-NEXT: ccmn w1, w8, #4, eq
-; SDISEL-NEXT: csel w0, w1, w0, gt
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: neg_range_int_comp_ua3:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: orr w8, w3, #0x1
+; CHECK-SD-NEXT: cmp w0, w2
+; CHECK-SD-NEXT: ccmn w1, w8, #4, eq
+; CHECK-SD-NEXT: csel w0, w1, w0, gt
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: neg_range_int_comp_ua3:
-; GISEL: // %bb.0:
-; GISEL-NEXT: orr w8, w3, #0x1
-; GISEL-NEXT: cmp w0, w2
-; GISEL-NEXT: neg w8, w8
-; GISEL-NEXT: ccmp w1, w8, #4, eq
-; GISEL-NEXT: csel w0, w1, w0, gt
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: neg_range_int_comp_ua3:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: orr w8, w3, #0x1
+; CHECK-GI-NEXT: cmp w0, w2
+; CHECK-GI-NEXT: neg w8, w8
+; CHECK-GI-NEXT: ccmp w1, w8, #4, eq
+; CHECK-GI-NEXT: csel w0, w1, w0, gt
+; CHECK-GI-NEXT: ret
%dor = or i32 %d, 1
%negd = sub i32 0, %dor
%cmp = icmp sgt i32 %b, %negd
@@ -471,26 +471,26 @@ define i32 @neg_range_int_comp_ua3(i32 %a, i32 %b, i32 %c, i32 %d) {
; -(a | 1) > (b | 3) && a < c
define i32 @neg_range_int_c(i32 %a, i32 %b, i32 %c) {
-; SDISEL-LABEL: neg_range_int_c:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: orr w8, w0, #0x1
-; SDISEL-NEXT: orr w9, w1, #0x3
-; SDISEL-NEXT: cmn w9, w8
-; SDISEL-NEXT: ccmp w2, w0, #2, lo
-; SDISEL-NEXT: cset w0, lo
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: neg_range_int_c:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: orr w8, w0, #0x1
+; CHECK-SD-NEXT: orr w9, w1, #0x3
+; CHECK-SD-NEXT: cmn w9, w8
+; CHECK-SD-NEXT: ccmp w2, w0, #2, lo
+; CHECK-SD-NEXT: cset w0, lo
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: neg_range_int_c:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: orr w8, w0, #0x1
-; GISEL-NEXT: orr w9, w1, #0x3
-; GISEL-NEXT: neg w8, w8
-; GISEL-NEXT: cmp w9, w8
-; GISEL-NEXT: cset w8, lo
-; GISEL-NEXT: cmp w2, w0
-; GISEL-NEXT: cset w9, lo
-; GISEL-NEXT: and w0, w8, w9
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: neg_range_int_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: orr w8, w0, #0x1
+; CHECK-GI-NEXT: orr w9, w1, #0x3
+; CHECK-GI-NEXT: neg w8, w8
+; CHECK-GI-NEXT: cmp w9, w8
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: cmp w2, w0
+; CHECK-GI-NEXT: cset w9, lo
+; CHECK-GI-NEXT: and w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%or = or i32 %a, 1
%sub = sub i32 0, %or
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
index 13434fa..7686740 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
@@ -203,93 +203,89 @@ define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c)
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3
+; CHECK-NEXT: ldr s17, [sp, #40]
+; CHECK-NEXT: add x10, sp, #56
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
-; CHECK-NEXT: ldr s17, [sp, #32]
-; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5
; CHECK-NEXT: add x9, sp, #48
-; CHECK-NEXT: add x10, sp, #64
; CHECK-NEXT: mov v1.s[1], v3.s[0]
+; CHECK-NEXT: ldr s3, [sp, #32]
+; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
; CHECK-NEXT: mov v0.s[1], v2.s[0]
+; CHECK-NEXT: ld1 { v17.s }[1], [x10]
+; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5
+; CHECK-NEXT: ldr s16, [sp, #8]
; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4
-; CHECK-NEXT: add x11, sp, #72
-; CHECK-NEXT: ld1 { v17.s }[1], [x9]
-; CHECK-NEXT: ldr s18, [x10]
-; CHECK-NEXT: add x9, sp, #80
-; CHECK-NEXT: add x10, sp, #56
-; CHECK-NEXT: // kill: def $s6 killed $s6 def $q6
+; CHECK-NEXT: add x10, sp, #24
+; CHECK-NEXT: ld1 { v3.s }[1], [x9]
+; CHECK-NEXT: add x9, sp, #72
; CHECK-NEXT: // kill: def $s7 killed $s7 def $q7
-; CHECK-NEXT: ldr s16, [sp, #8]
-; CHECK-NEXT: ldr s3, [sp, #96]
-; CHECK-NEXT: ld1 { v18.s }[1], [x9]
-; CHECK-NEXT: add x9, sp, #88
+; CHECK-NEXT: // kill: def $s6 killed $s6 def $q6
; CHECK-NEXT: ldr s2, [sp]
+; CHECK-NEXT: ld1 { v16.s }[1], [x10]
+; CHECK-NEXT: add x10, sp, #112
+; CHECK-NEXT: ldr s20, [sp, #136]
; CHECK-NEXT: mov v1.s[2], v5.s[0]
-; CHECK-NEXT: ldr s5, [sp, #40]
+; CHECK-NEXT: ld1 { v17.s }[2], [x9]
+; CHECK-NEXT: add x9, sp, #64
+; CHECK-NEXT: ldr s5, [sp, #96]
+; CHECK-NEXT: ld1 { v3.s }[2], [x9]
; CHECK-NEXT: mov v0.s[2], v4.s[0]
+; CHECK-NEXT: add x9, sp, #88
+; CHECK-NEXT: ldr s4, [sp, #104]
+; CHECK-NEXT: ldr s19, [sp, #192]
; CHECK-NEXT: ld1 { v5.s }[1], [x10]
-; CHECK-NEXT: ldr s19, [x11]
+; CHECK-NEXT: add x10, sp, #80
+; CHECK-NEXT: ld1 { v17.s }[3], [x9]
+; CHECK-NEXT: mov v1.s[3], v7.s[0]
+; CHECK-NEXT: add x9, sp, #120
+; CHECK-NEXT: ld1 { v3.s }[3], [x10]
+; CHECK-NEXT: ld1 { v4.s }[1], [x9]
+; CHECK-NEXT: ldr s7, [sp, #128]
; CHECK-NEXT: add x10, sp, #144
-; CHECK-NEXT: zip1 v4.2d, v17.2d, v18.2d
-; CHECK-NEXT: add x11, sp, #160
-; CHECK-NEXT: ldr s18, [sp, #136]
-; CHECK-NEXT: ld1 { v19.s }[1], [x9]
; CHECK-NEXT: mov v0.s[3], v6.s[0]
-; CHECK-NEXT: ldr s6, [sp, #128]
-; CHECK-NEXT: mov v1.s[3], v7.s[0]
-; CHECK-NEXT: add x9, sp, #24
-; CHECK-NEXT: ldr s7, [sp, #104]
-; CHECK-NEXT: ld1 { v16.s }[1], [x9]
-; CHECK-NEXT: add x9, sp, #112
-; CHECK-NEXT: ld1 { v6.s }[1], [x10]
-; CHECK-NEXT: zip1 v5.2d, v5.2d, v19.2d
-; CHECK-NEXT: add x10, sp, #120
-; CHECK-NEXT: ld1 { v3.s }[1], [x9]
+; CHECK-NEXT: add x9, sp, #16
; CHECK-NEXT: ld1 { v7.s }[1], [x10]
-; CHECK-NEXT: ldr s17, [x11]
-; CHECK-NEXT: add x9, sp, #176
-; CHECK-NEXT: add x10, sp, #16
-; CHECK-NEXT: add x11, sp, #168
-; CHECK-NEXT: ld1 { v17.s }[1], [x9]
-; CHECK-NEXT: ld1 { v2.s }[1], [x10]
-; CHECK-NEXT: add x9, sp, #152
-; CHECK-NEXT: fmul v19.4s, v5.4s, v1.4s
-; CHECK-NEXT: fmul v20.4s, v7.4s, v16.4s
-; CHECK-NEXT: fmul v16.4s, v3.4s, v16.4s
-; CHECK-NEXT: fmul v1.4s, v4.4s, v1.4s
-; CHECK-NEXT: ld1 { v18.s }[1], [x9]
-; CHECK-NEXT: ldr s21, [x11]
-; CHECK-NEXT: zip1 v6.2d, v6.2d, v17.2d
-; CHECK-NEXT: ldr s17, [sp, #192]
-; CHECK-NEXT: add x9, sp, #184
+; CHECK-NEXT: ld1 { v2.s }[1], [x9]
+; CHECK-NEXT: add x9, sp, #160
+; CHECK-NEXT: fmul v6.4s, v17.4s, v1.4s
+; CHECK-NEXT: fmul v18.4s, v4.4s, v16.4s
+; CHECK-NEXT: fmul v16.4s, v5.4s, v16.4s
+; CHECK-NEXT: fmul v1.4s, v3.4s, v1.4s
; CHECK-NEXT: add x10, sp, #208
-; CHECK-NEXT: ld1 { v21.s }[1], [x9]
+; CHECK-NEXT: ld1 { v7.s }[2], [x9]
+; CHECK-NEXT: add x9, sp, #152
+; CHECK-NEXT: ld1 { v19.s }[1], [x10]
+; CHECK-NEXT: ld1 { v20.s }[1], [x9]
+; CHECK-NEXT: add x9, sp, #176
+; CHECK-NEXT: add x10, sp, #184
+; CHECK-NEXT: fneg v6.4s, v6.4s
+; CHECK-NEXT: fneg v18.4s, v18.4s
+; CHECK-NEXT: fmla v16.4s, v2.4s, v4.4s
+; CHECK-NEXT: fmla v1.4s, v0.4s, v17.4s
+; CHECK-NEXT: ld1 { v7.s }[3], [x9]
+; CHECK-NEXT: add x9, sp, #168
+; CHECK-NEXT: ld1 { v20.s }[2], [x9]
+; CHECK-NEXT: ldr s4, [sp, #200]
; CHECK-NEXT: add x9, sp, #216
-; CHECK-NEXT: fneg v19.4s, v19.4s
-; CHECK-NEXT: fneg v20.4s, v20.4s
-; CHECK-NEXT: fmla v16.4s, v2.4s, v7.4s
-; CHECK-NEXT: fmla v1.4s, v0.4s, v5.4s
-; CHECK-NEXT: ld1 { v17.s }[1], [x10]
-; CHECK-NEXT: ldr s5, [sp, #200]
-; CHECK-NEXT: zip1 v7.2d, v18.2d, v21.2d
-; CHECK-NEXT: ld1 { v5.s }[1], [x9]
-; CHECK-NEXT: fmla v19.4s, v0.4s, v4.4s
-; CHECK-NEXT: fmla v20.4s, v2.4s, v3.4s
-; CHECK-NEXT: fsub v0.4s, v6.4s, v1.4s
-; CHECK-NEXT: fsub v1.4s, v17.4s, v16.4s
-; CHECK-NEXT: fadd v2.4s, v7.4s, v19.4s
-; CHECK-NEXT: fadd v3.4s, v5.4s, v20.4s
+; CHECK-NEXT: fmla v6.4s, v0.4s, v3.4s
+; CHECK-NEXT: fmla v18.4s, v2.4s, v5.4s
+; CHECK-NEXT: ld1 { v4.s }[1], [x9]
+; CHECK-NEXT: fsub v0.4s, v7.4s, v1.4s
+; CHECK-NEXT: fsub v1.4s, v19.4s, v16.4s
+; CHECK-NEXT: ld1 { v20.s }[3], [x10]
+; CHECK-NEXT: fadd v2.4s, v4.4s, v18.4s
+; CHECK-NEXT: fadd v3.4s, v20.4s, v6.4s
; CHECK-NEXT: ext v4.16b, v0.16b, v1.16b, #12
-; CHECK-NEXT: ext v5.16b, v2.16b, v3.16b, #12
-; CHECK-NEXT: trn2 v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: ext v5.16b, v3.16b, v2.16b, #12
+; CHECK-NEXT: trn2 v1.4s, v1.4s, v2.4s
; CHECK-NEXT: ext v4.16b, v0.16b, v4.16b, #12
-; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #8
+; CHECK-NEXT: ext v5.16b, v3.16b, v5.16b, #8
; CHECK-NEXT: rev64 v4.4s, v4.4s
-; CHECK-NEXT: trn2 v3.4s, v4.4s, v5.4s
-; CHECK-NEXT: zip2 v4.4s, v0.4s, v2.4s
-; CHECK-NEXT: zip1 v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: ext v1.16b, v3.16b, v1.16b, #8
-; CHECK-NEXT: mov v4.d[1], v3.d[0]
+; CHECK-NEXT: trn2 v2.4s, v4.4s, v5.4s
+; CHECK-NEXT: zip2 v4.4s, v0.4s, v3.4s
+; CHECK-NEXT: zip1 v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: ext v1.16b, v2.16b, v1.16b, #8
+; CHECK-NEXT: mov v4.d[1], v2.d[0]
; CHECK-NEXT: str q0, [x8]
; CHECK-NEXT: stp q4, q1, [x8, #16]
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll
index e6f27b9..acf15f1 100644
--- a/llvm/test/CodeGen/AArch64/concat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/concat-vector.ll
@@ -186,9 +186,8 @@ define <16 x i8> @concat_v16s8_v4s8_load(ptr %ptrA, ptr %ptrB, ptr %ptrC, ptr %p
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ld1 { v0.s }[1], [x1]
-; CHECK-NEXT: ldr s1, [x2]
-; CHECK-NEXT: ld1 { v1.s }[1], [x3]
-; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: ld1 { v0.s }[2], [x2]
+; CHECK-NEXT: ld1 { v0.s }[3], [x3]
; CHECK-NEXT: ret
%A = load <4 x i8>, ptr %ptrA
%B = load <4 x i8>, ptr %ptrB
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-select.ll b/llvm/test/CodeGen/AArch64/dag-combine-select.ll
index 56208f1..02b0077 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-select.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-select.ll
@@ -1,26 +1,26 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple arm64-none-eabi -o - %s | FileCheck %s --check-prefixes=CHECK,SDISEL
-; RUN: llc -mtriple arm64-none-eabi -global-isel -o - %s | FileCheck %s --check-prefixes=CHECK,GISEL
+; RUN: llc -mtriple arm64-none-eabi -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple arm64-none-eabi -global-isel -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
@out = internal global i32 0, align 4
; Ensure that we transform select(C0, x, select(C1, x, y)) towards
; select(C0 | C1, x, y) so we can use CMP;CCMP for the implementation.
define i32 @test0(i32 %v0, i32 %v1, i32 %v2) {
-; SDISEL-LABEL: test0:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: cmp w0, #7
-; SDISEL-NEXT: ccmp w1, #0, #0, ne
-; SDISEL-NEXT: csel w0, w1, w2, gt
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: test0:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: cmp w0, #7
+; CHECK-SD-NEXT: ccmp w1, #0, #0, ne
+; CHECK-SD-NEXT: csel w0, w1, w2, gt
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: test0:
-; GISEL: // %bb.0:
-; GISEL-NEXT: cmp w0, #7
-; GISEL-NEXT: csel w8, w1, w2, eq
-; GISEL-NEXT: cmp w1, #0
-; GISEL-NEXT: csel w0, w1, w8, gt
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: test0:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: cmp w0, #7
+; CHECK-GI-NEXT: csel w8, w1, w2, eq
+; CHECK-GI-NEXT: cmp w1, #0
+; CHECK-GI-NEXT: csel w0, w1, w8, gt
+; CHECK-GI-NEXT: ret
%cmp1 = icmp eq i32 %v0, 7
%cmp2 = icmp sgt i32 %v1, 0
%sel0 = select i1 %cmp1, i32 %v1, i32 %v2
@@ -32,36 +32,36 @@ define i32 @test0(i32 %v0, i32 %v1, i32 %v2) {
; sequences. This case should be transformed to select(C0, select(C1, x, y), y)
; anyway to get CSE effects.
define void @test1(i32 %bitset, i32 %val0, i32 %val1) {
-; SDISEL-LABEL: test1:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: cmp w0, #7
-; SDISEL-NEXT: adrp x9, out
-; SDISEL-NEXT: csel w8, w1, w2, eq
-; SDISEL-NEXT: cmp w8, #13
-; SDISEL-NEXT: csel w8, w1, w2, lo
-; SDISEL-NEXT: cmp w0, #42
-; SDISEL-NEXT: csel w10, w1, w8, eq
-; SDISEL-NEXT: str w8, [x9, :lo12:out]
-; SDISEL-NEXT: str w10, [x9, :lo12:out]
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: test1:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: cmp w0, #7
+; CHECK-SD-NEXT: adrp x9, out
+; CHECK-SD-NEXT: csel w8, w1, w2, eq
+; CHECK-SD-NEXT: cmp w8, #13
+; CHECK-SD-NEXT: csel w8, w1, w2, lo
+; CHECK-SD-NEXT: cmp w0, #42
+; CHECK-SD-NEXT: csel w10, w1, w8, eq
+; CHECK-SD-NEXT: str w8, [x9, :lo12:out]
+; CHECK-SD-NEXT: str w10, [x9, :lo12:out]
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: test1:
-; GISEL: // %bb.0:
-; GISEL-NEXT: cmp w0, #7
-; GISEL-NEXT: csel w8, w1, w2, eq
-; GISEL-NEXT: cmp w8, #13
-; GISEL-NEXT: cset w8, lo
-; GISEL-NEXT: tst w8, #0x1
-; GISEL-NEXT: csel w9, w1, w2, ne
-; GISEL-NEXT: cmp w0, #42
-; GISEL-NEXT: cset w10, eq
-; GISEL-NEXT: orr w8, w10, w8
-; GISEL-NEXT: tst w8, #0x1
-; GISEL-NEXT: adrp x8, out
-; GISEL-NEXT: csel w10, w1, w2, ne
-; GISEL-NEXT: str w9, [x8, :lo12:out]
-; GISEL-NEXT: str w10, [x8, :lo12:out]
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: test1:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: cmp w0, #7
+; CHECK-GI-NEXT: csel w8, w1, w2, eq
+; CHECK-GI-NEXT: cmp w8, #13
+; CHECK-GI-NEXT: cset w8, lo
+; CHECK-GI-NEXT: tst w8, #0x1
+; CHECK-GI-NEXT: csel w9, w1, w2, ne
+; CHECK-GI-NEXT: cmp w0, #42
+; CHECK-GI-NEXT: cset w10, eq
+; CHECK-GI-NEXT: orr w8, w10, w8
+; CHECK-GI-NEXT: tst w8, #0x1
+; CHECK-GI-NEXT: adrp x8, out
+; CHECK-GI-NEXT: csel w10, w1, w2, ne
+; CHECK-GI-NEXT: str w9, [x8, :lo12:out]
+; CHECK-GI-NEXT: str w10, [x8, :lo12:out]
+; CHECK-GI-NEXT: ret
%cmp1 = icmp eq i32 %bitset, 7
%cond = select i1 %cmp1, i32 %val0, i32 %val1
%cmp5 = icmp ult i32 %cond, 13
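The select transform exercised by this test can also be seen directly at the IR level. A minimal sketch (function and value names are illustrative, not taken from the test): folding the nested select into a single select over the OR of both conditions is what lets AArch64 lower the combined condition with cmp + ccmp + csel.

define i32 @select_chain(i32 %v0, i32 %v1, i32 %v2) {
  %c0 = icmp eq i32 %v0, 7
  %c1 = icmp sgt i32 %v1, 0
  %inner = select i1 %c1, i32 %v1, i32 %v2
  %outer = select i1 %c0, i32 %v1, i32 %inner
  ; select(C0, x, select(C1, x, y)) == select(C0 | C1, x, y), so the pair
  ; above is equivalent to:
  ;   %or  = or i1 %c0, %c1
  ;   %sel = select i1 %or, i32 %v1, i32 %v2
  ret i32 %outer
}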
diff --git a/llvm/test/CodeGen/AArch64/fcsel-zero.ll b/llvm/test/CodeGen/AArch64/fcsel-zero.ll
index 3fbcd10..3db588b 100644
--- a/llvm/test/CodeGen/AArch64/fcsel-zero.ll
+++ b/llvm/test/CodeGen/AArch64/fcsel-zero.ll
@@ -2,8 +2,8 @@
; RUN: llc -mtriple=aarch64-linux-gnu -o - < %s | FileCheck %s
-define float @foeq(float %a, float %b) #0 {
- %t = fcmp oeq float %a, 0.0
+define float @foeq(float %a, float %b) {
+ %t = fcmp nsz oeq float %a, 0.0
%v = select i1 %t, float 0.0, float %b
ret float %v
; CHECK-LABEL: foeq
@@ -11,8 +11,8 @@ define float @foeq(float %a, float %b) #0 {
; CHECK-NEXT: fcsel {{s[0-9]+}}, [[R]], {{s[0-9]+}}, eq
}
-define float @fueq(float %a, float %b) #0 {
- %t = fcmp ueq float %a, 0.0
+define float @fueq(float %a, float %b) {
+ %t = fcmp nsz ueq float %a, 0.0
%v = select i1 %t, float 0.0, float %b
ret float %v
; CHECK-LABEL: fueq
@@ -21,8 +21,8 @@ define float @fueq(float %a, float %b) #0 {
; CHECK-NEXT: fcsel {{s[0-9]+}}, [[R]], {{s[0-9]+}}, vs
}
-define float @fone(float %a, float %b) #0 {
- %t = fcmp one float %a, 0.0
+define float @fone(float %a, float %b) {
+ %t = fcmp nsz one float %a, 0.0
%v = select i1 %t, float %b, float 0.0
ret float %v
; CHECK-LABEL: fone
@@ -31,8 +31,8 @@ define float @fone(float %a, float %b) #0 {
; CHECK-NEXT: fcsel {{s[0-9]+}}, {{s[0-9]+}}, [[R]], gt
}
-define float @fune(float %a, float %b) #0 {
- %t = fcmp une float %a, 0.0
+define float @fune(float %a, float %b) {
+ %t = fcmp nsz une float %a, 0.0
%v = select i1 %t, float %b, float 0.0
ret float %v
; CHECK-LABEL: fune
@@ -40,8 +40,8 @@ define float @fune(float %a, float %b) #0 {
; CHECK-NEXT: fcsel {{s[0-9]+}}, {{s[0-9]+}}, [[R]], ne
}
-define double @doeq(double %a, double %b) #0 {
- %t = fcmp oeq double %a, 0.0
+define double @doeq(double %a, double %b) {
+ %t = fcmp nsz oeq double %a, 0.0
%v = select i1 %t, double 0.0, double %b
ret double %v
; CHECK-LABEL: doeq
@@ -49,8 +49,8 @@ define double @doeq(double %a, double %b) #0 {
; CHECK-NEXT: fcsel {{d[0-9]+}}, [[R]], {{d[0-9]+}}, eq
}
-define double @dueq(double %a, double %b) #0 {
- %t = fcmp ueq double %a, 0.0
+define double @dueq(double %a, double %b) {
+ %t = fcmp nsz ueq double %a, 0.0
%v = select i1 %t, double 0.0, double %b
ret double %v
; CHECK-LABEL: dueq
@@ -59,8 +59,8 @@ define double @dueq(double %a, double %b) #0 {
; CHECK-NEXT: fcsel {{d[0-9]+}}, [[R]], {{d[0-9]+}}, vs
}
-define double @done(double %a, double %b) #0 {
- %t = fcmp one double %a, 0.0
+define double @done(double %a, double %b) {
+ %t = fcmp nsz one double %a, 0.0
%v = select i1 %t, double %b, double 0.0
ret double %v
; CHECK-LABEL: done
@@ -69,14 +69,11 @@ define double @done(double %a, double %b) #0 {
; CHECK-NEXT: fcsel {{d[0-9]+}}, {{d[0-9]+}}, [[R]], gt
}
-define double @dune(double %a, double %b) #0 {
- %t = fcmp une double %a, 0.0
+define double @dune(double %a, double %b) {
+ %t = fcmp nsz une double %a, 0.0
%v = select i1 %t, double %b, double 0.0
ret double %v
; CHECK-LABEL: dune
; CHECK: fcmp [[R:d[0-9]+]], #0.0
; CHECK-NEXT: fcsel {{d[0-9]+}}, {{d[0-9]+}}, [[R]], ne
}
-
-attributes #0 = { nounwind "unsafe-fp-math"="true" }
-
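For context on the flag swap in this file, a minimal sketch (illustrative function name): the per-instruction nsz fast-math flag carries the same no-signed-zeros assumption that the deleted "unsafe-fp-math" function attribute used to provide globally, which is what permits the fcmp'd register to feed the fcsel as the 0.0 operand.

define float @select_zero(float %a, float %b) {
  ; With nsz, if %a compares equal to 0.0 the select result 0.0 may be
  ; materialized by reusing %a's register, even when %a is -0.0 (the sign
  ; of a zero is treated as insignificant).
  %t = fcmp nsz oeq float %a, 0.0
  %v = select i1 %t, float 0.0, float %b
  ret float %v
}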
diff --git a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll
index 4906e2e..c6b8e41 100644
--- a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll
+++ b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll
@@ -1431,7 +1431,6 @@ define <9 x half> @max_v9f16(<9 x half> %a, <9 x half> %b) {
; FULLFP16-NEXT: add x9, sp, #16
; FULLFP16-NEXT: // kill: def $h3 killed $h3 def $q3
; FULLFP16-NEXT: // kill: def $h4 killed $h4 def $q4
-; FULLFP16-NEXT: add x10, sp, #40
; FULLFP16-NEXT: // kill: def $h5 killed $h5 def $q5
; FULLFP16-NEXT: // kill: def $h6 killed $h6 def $q6
; FULLFP16-NEXT: // kill: def $h7 killed $h7 def $q7
@@ -1440,30 +1439,30 @@ define <9 x half> @max_v9f16(<9 x half> %a, <9 x half> %b) {
; FULLFP16-NEXT: ld1 { v1.h }[1], [x9]
; FULLFP16-NEXT: add x9, sp, #24
; FULLFP16-NEXT: mov v0.h[2], v2.h[0]
+; FULLFP16-NEXT: ldr h2, [sp]
; FULLFP16-NEXT: ld1 { v1.h }[2], [x9]
; FULLFP16-NEXT: add x9, sp, #32
+; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h
; FULLFP16-NEXT: mov v0.h[3], v3.h[0]
; FULLFP16-NEXT: ld1 { v1.h }[3], [x9]
-; FULLFP16-NEXT: ldr h2, [x10]
-; FULLFP16-NEXT: add x9, sp, #48
+; FULLFP16-NEXT: add x9, sp, #40
; FULLFP16-NEXT: ldr h3, [sp, #72]
-; FULLFP16-NEXT: ld1 { v2.h }[1], [x9]
-; FULLFP16-NEXT: add x9, sp, #56
+; FULLFP16-NEXT: ld1 { v1.h }[4], [x9]
+; FULLFP16-NEXT: add x9, sp, #48
; FULLFP16-NEXT: fminnm v3.8h, v3.8h, v3.8h
; FULLFP16-NEXT: mov v0.h[4], v4.h[0]
-; FULLFP16-NEXT: ld1 { v2.h }[2], [x9]
-; FULLFP16-NEXT: add x9, sp, #64
+; FULLFP16-NEXT: ld1 { v1.h }[5], [x9]
+; FULLFP16-NEXT: add x9, sp, #56
+; FULLFP16-NEXT: fmaxnm v2.8h, v2.8h, v3.8h
; FULLFP16-NEXT: mov v0.h[5], v5.h[0]
-; FULLFP16-NEXT: ld1 { v2.h }[3], [x9]
-; FULLFP16-NEXT: zip1 v1.2d, v1.2d, v2.2d
-; FULLFP16-NEXT: ldr h2, [sp]
+; FULLFP16-NEXT: ld1 { v1.h }[6], [x9]
+; FULLFP16-NEXT: add x9, sp, #64
+; FULLFP16-NEXT: str h2, [x8, #16]
; FULLFP16-NEXT: mov v0.h[6], v6.h[0]
-; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h
+; FULLFP16-NEXT: ld1 { v1.h }[7], [x9]
; FULLFP16-NEXT: fminnm v1.8h, v1.8h, v1.8h
; FULLFP16-NEXT: mov v0.h[7], v7.h[0]
-; FULLFP16-NEXT: fmaxnm v2.8h, v2.8h, v3.8h
; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v0.8h
-; FULLFP16-NEXT: str h2, [x8, #16]
; FULLFP16-NEXT: fmaxnm v0.8h, v0.8h, v1.8h
; FULLFP16-NEXT: str q0, [x8]
; FULLFP16-NEXT: ret
@@ -2013,7 +2012,6 @@ define <9 x half> @min_v9f16(<9 x half> %a, <9 x half> %b) {
; FULLFP16-NEXT: add x9, sp, #16
; FULLFP16-NEXT: // kill: def $h3 killed $h3 def $q3
; FULLFP16-NEXT: // kill: def $h4 killed $h4 def $q4
-; FULLFP16-NEXT: add x10, sp, #40
; FULLFP16-NEXT: // kill: def $h5 killed $h5 def $q5
; FULLFP16-NEXT: // kill: def $h6 killed $h6 def $q6
; FULLFP16-NEXT: // kill: def $h7 killed $h7 def $q7
@@ -2022,30 +2020,30 @@ define <9 x half> @min_v9f16(<9 x half> %a, <9 x half> %b) {
; FULLFP16-NEXT: ld1 { v1.h }[1], [x9]
; FULLFP16-NEXT: add x9, sp, #24
; FULLFP16-NEXT: mov v0.h[2], v2.h[0]
+; FULLFP16-NEXT: ldr h2, [sp]
; FULLFP16-NEXT: ld1 { v1.h }[2], [x9]
; FULLFP16-NEXT: add x9, sp, #32
+; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h
; FULLFP16-NEXT: mov v0.h[3], v3.h[0]
; FULLFP16-NEXT: ld1 { v1.h }[3], [x9]
-; FULLFP16-NEXT: ldr h2, [x10]
-; FULLFP16-NEXT: add x9, sp, #48
+; FULLFP16-NEXT: add x9, sp, #40
; FULLFP16-NEXT: ldr h3, [sp, #72]
-; FULLFP16-NEXT: ld1 { v2.h }[1], [x9]
-; FULLFP16-NEXT: add x9, sp, #56
+; FULLFP16-NEXT: ld1 { v1.h }[4], [x9]
+; FULLFP16-NEXT: add x9, sp, #48
; FULLFP16-NEXT: fminnm v3.8h, v3.8h, v3.8h
; FULLFP16-NEXT: mov v0.h[4], v4.h[0]
-; FULLFP16-NEXT: ld1 { v2.h }[2], [x9]
-; FULLFP16-NEXT: add x9, sp, #64
+; FULLFP16-NEXT: ld1 { v1.h }[5], [x9]
+; FULLFP16-NEXT: add x9, sp, #56
+; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v3.8h
; FULLFP16-NEXT: mov v0.h[5], v5.h[0]
-; FULLFP16-NEXT: ld1 { v2.h }[3], [x9]
-; FULLFP16-NEXT: zip1 v1.2d, v1.2d, v2.2d
-; FULLFP16-NEXT: ldr h2, [sp]
+; FULLFP16-NEXT: ld1 { v1.h }[6], [x9]
+; FULLFP16-NEXT: add x9, sp, #64
+; FULLFP16-NEXT: str h2, [x8, #16]
; FULLFP16-NEXT: mov v0.h[6], v6.h[0]
-; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h
+; FULLFP16-NEXT: ld1 { v1.h }[7], [x9]
; FULLFP16-NEXT: fminnm v1.8h, v1.8h, v1.8h
; FULLFP16-NEXT: mov v0.h[7], v7.h[0]
-; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v3.8h
; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v0.8h
-; FULLFP16-NEXT: str h2, [x8, #16]
; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v1.8h
; FULLFP16-NEXT: str q0, [x8]
; FULLFP16-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll
index 1b98954..b056460 100644
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_1op.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64 -global-isel=0 -mattr=+v8.2a,+fullfp16 | FileCheck %s --check-prefixes=CHECK,SDISEL
-; RUN: llc < %s -mtriple=aarch64 -global-isel=1 -mattr=+v8.2a,+fullfp16 | FileCheck %s --check-prefixes=CHECK,GISEL
+; RUN: llc < %s -mtriple=aarch64 -global-isel=0 -mattr=+v8.2a,+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -mtriple=aarch64 -global-isel=1 -mattr=+v8.2a,+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
declare i64 @llvm.aarch64.neon.fcvtpu.i64.f16(half)
declare i32 @llvm.aarch64.neon.fcvtpu.i32.f16(half)
@@ -27,18 +27,18 @@ declare half @llvm.aarch64.neon.frecpx.f16(half)
declare half @llvm.aarch64.neon.frecpe.f16(half)
define dso_local i16 @t2(half %a) {
-; SDISEL-LABEL: t2:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: fcmp h0, #0.0
-; SDISEL-NEXT: csetm w0, eq
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: t2:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcmp h0, #0.0
+; CHECK-SD-NEXT: csetm w0, eq
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: t2:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: fcmp h0, #0.0
-; GISEL-NEXT: cset w8, eq
-; GISEL-NEXT: sbfx w0, w8, #0, #1
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: t2:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fcmp h0, #0.0
+; CHECK-GI-NEXT: cset w8, eq
+; CHECK-GI-NEXT: sbfx w0, w8, #0, #1
+; CHECK-GI-NEXT: ret
entry:
%0 = fcmp oeq half %a, 0xH0000
%vceqz = sext i1 %0 to i16
@@ -46,18 +46,18 @@ entry:
}
define dso_local i16 @t3(half %a) {
-; SDISEL-LABEL: t3:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: fcmp h0, #0.0
-; SDISEL-NEXT: csetm w0, ge
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: t3:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcmp h0, #0.0
+; CHECK-SD-NEXT: csetm w0, ge
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: t3:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: fcmp h0, #0.0
-; GISEL-NEXT: cset w8, ge
-; GISEL-NEXT: sbfx w0, w8, #0, #1
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: t3:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fcmp h0, #0.0
+; CHECK-GI-NEXT: cset w8, ge
+; CHECK-GI-NEXT: sbfx w0, w8, #0, #1
+; CHECK-GI-NEXT: ret
entry:
%0 = fcmp oge half %a, 0xH0000
%vcgez = sext i1 %0 to i16
@@ -65,18 +65,18 @@ entry:
}
define dso_local i16 @t4(half %a) {
-; SDISEL-LABEL: t4:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: fcmp h0, #0.0
-; SDISEL-NEXT: csetm w0, gt
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: t4:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcmp h0, #0.0
+; CHECK-SD-NEXT: csetm w0, gt
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: t4:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: fcmp h0, #0.0
-; GISEL-NEXT: cset w8, gt
-; GISEL-NEXT: sbfx w0, w8, #0, #1
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: t4:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fcmp h0, #0.0
+; CHECK-GI-NEXT: cset w8, gt
+; CHECK-GI-NEXT: sbfx w0, w8, #0, #1
+; CHECK-GI-NEXT: ret
entry:
%0 = fcmp ogt half %a, 0xH0000
%vcgtz = sext i1 %0 to i16
@@ -84,18 +84,18 @@ entry:
}
define dso_local i16 @t5(half %a) {
-; SDISEL-LABEL: t5:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: fcmp h0, #0.0
-; SDISEL-NEXT: csetm w0, ls
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: t5:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcmp h0, #0.0
+; CHECK-SD-NEXT: csetm w0, ls
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: t5:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: fcmp h0, #0.0
-; GISEL-NEXT: cset w8, ls
-; GISEL-NEXT: sbfx w0, w8, #0, #1
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: t5:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fcmp h0, #0.0
+; CHECK-GI-NEXT: cset w8, ls
+; CHECK-GI-NEXT: sbfx w0, w8, #0, #1
+; CHECK-GI-NEXT: ret
entry:
%0 = fcmp ole half %a, 0xH0000
%vclez = sext i1 %0 to i16
@@ -103,18 +103,18 @@ entry:
}
define dso_local i16 @t6(half %a) {
-; SDISEL-LABEL: t6:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: fcmp h0, #0.0
-; SDISEL-NEXT: csetm w0, mi
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: t6:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcmp h0, #0.0
+; CHECK-SD-NEXT: csetm w0, mi
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: t6:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: fcmp h0, #0.0
-; GISEL-NEXT: cset w8, mi
-; GISEL-NEXT: sbfx w0, w8, #0, #1
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: t6:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fcmp h0, #0.0
+; CHECK-GI-NEXT: cset w8, mi
+; CHECK-GI-NEXT: sbfx w0, w8, #0, #1
+; CHECK-GI-NEXT: ret
entry:
%0 = fcmp olt half %a, 0xH0000
%vcltz = sext i1 %0 to i16
@@ -172,15 +172,15 @@ entry:
}
define dso_local i16 @t16(half %a) {
-; SDISEL-LABEL: t16:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: fcvtzs w0, h0
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: t16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcvtzs w0, h0
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: t16:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: fcvtzu w0, h0
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: t16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fcvtzu w0, h0
+; CHECK-GI-NEXT: ret
entry:
%0 = fptoui half %a to i16
ret i16 %0
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_2op.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_2op.ll
index 5b08ef2..da70599 100644
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_2op.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_2op.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64 -global-isel=0 -mattr=+v8.2a,+fullfp16 | FileCheck %s --check-prefixes=CHECK,SDISEL
-; RUN: llc < %s -mtriple=aarch64 -global-isel=1 -mattr=+v8.2a,+fullfp16 | FileCheck %s --check-prefixes=CHECK,GISEL
+; RUN: llc < %s -mtriple=aarch64 -global-isel=0 -mattr=+v8.2a,+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -mtriple=aarch64 -global-isel=1 -mattr=+v8.2a,+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
declare half @llvm.aarch64.sisd.fabd.f16(half, half)
@@ -35,18 +35,18 @@ entry:
}
define dso_local i16 @t_vceqh_f16(half %a, half %b) {
-; SDISEL-LABEL: t_vceqh_f16:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: fcmp h0, h1
-; SDISEL-NEXT: csetm w0, eq
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: t_vceqh_f16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcmp h0, h1
+; CHECK-SD-NEXT: csetm w0, eq
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: t_vceqh_f16:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: fcmp h0, h1
-; GISEL-NEXT: cset w8, eq
-; GISEL-NEXT: sbfx w0, w8, #0, #1
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: t_vceqh_f16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fcmp h0, h1
+; CHECK-GI-NEXT: cset w8, eq
+; CHECK-GI-NEXT: sbfx w0, w8, #0, #1
+; CHECK-GI-NEXT: ret
entry:
%0 = fcmp oeq half %a, %b
%vcmpd = sext i1 %0 to i16
@@ -54,18 +54,18 @@ entry:
}
define dso_local i16 @t_vcgeh_f16(half %a, half %b) {
-; SDISEL-LABEL: t_vcgeh_f16:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: fcmp h0, h1
-; SDISEL-NEXT: csetm w0, ge
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: t_vcgeh_f16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcmp h0, h1
+; CHECK-SD-NEXT: csetm w0, ge
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: t_vcgeh_f16:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: fcmp h0, h1
-; GISEL-NEXT: cset w8, ge
-; GISEL-NEXT: sbfx w0, w8, #0, #1
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: t_vcgeh_f16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fcmp h0, h1
+; CHECK-GI-NEXT: cset w8, ge
+; CHECK-GI-NEXT: sbfx w0, w8, #0, #1
+; CHECK-GI-NEXT: ret
entry:
%0 = fcmp oge half %a, %b
%vcmpd = sext i1 %0 to i16
@@ -73,18 +73,18 @@ entry:
}
define dso_local i16 @t_vcgth_f16(half %a, half %b) {
-; SDISEL-LABEL: t_vcgth_f16:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: fcmp h0, h1
-; SDISEL-NEXT: csetm w0, gt
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: t_vcgth_f16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcmp h0, h1
+; CHECK-SD-NEXT: csetm w0, gt
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: t_vcgth_f16:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: fcmp h0, h1
-; GISEL-NEXT: cset w8, gt
-; GISEL-NEXT: sbfx w0, w8, #0, #1
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: t_vcgth_f16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fcmp h0, h1
+; CHECK-GI-NEXT: cset w8, gt
+; CHECK-GI-NEXT: sbfx w0, w8, #0, #1
+; CHECK-GI-NEXT: ret
entry:
%0 = fcmp ogt half %a, %b
%vcmpd = sext i1 %0 to i16
@@ -92,18 +92,18 @@ entry:
}
define dso_local i16 @t_vcleh_f16(half %a, half %b) {
-; SDISEL-LABEL: t_vcleh_f16:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: fcmp h0, h1
-; SDISEL-NEXT: csetm w0, ls
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: t_vcleh_f16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcmp h0, h1
+; CHECK-SD-NEXT: csetm w0, ls
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: t_vcleh_f16:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: fcmp h0, h1
-; GISEL-NEXT: cset w8, ls
-; GISEL-NEXT: sbfx w0, w8, #0, #1
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: t_vcleh_f16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fcmp h0, h1
+; CHECK-GI-NEXT: cset w8, ls
+; CHECK-GI-NEXT: sbfx w0, w8, #0, #1
+; CHECK-GI-NEXT: ret
entry:
%0 = fcmp ole half %a, %b
%vcmpd = sext i1 %0 to i16
@@ -111,18 +111,18 @@ entry:
}
define dso_local i16 @t_vclth_f16(half %a, half %b) {
-; SDISEL-LABEL: t_vclth_f16:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: fcmp h0, h1
-; SDISEL-NEXT: csetm w0, mi
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: t_vclth_f16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fcmp h0, h1
+; CHECK-SD-NEXT: csetm w0, mi
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: t_vclth_f16:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: fcmp h0, h1
-; GISEL-NEXT: cset w8, mi
-; GISEL-NEXT: sbfx w0, w8, #0, #1
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: t_vclth_f16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fcmp h0, h1
+; CHECK-GI-NEXT: cset w8, mi
+; CHECK-GI-NEXT: sbfx w0, w8, #0, #1
+; CHECK-GI-NEXT: ret
entry:
%0 = fcmp olt half %a, %b
%vcmpd = sext i1 %0 to i16
@@ -187,18 +187,18 @@ declare half @llvm.aarch64.neon.vcvtfxu2fp.f16.i32(i32, i32) #1
declare i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f16(half, i32) #1
define dso_local half @test_vcvth_n_f16_s16_1(i16 %a) {
-; SDISEL-LABEL: test_vcvth_n_f16_s16_1:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: fmov s0, w0
-; SDISEL-NEXT: scvtf h0, h0, #1
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: test_vcvth_n_f16_s16_1:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov s0, w0
+; CHECK-SD-NEXT: scvtf h0, h0, #1
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: test_vcvth_n_f16_s16_1:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: sxth w8, w0
-; GISEL-NEXT: fmov s0, w8
-; GISEL-NEXT: scvtf h0, h0, #1
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: test_vcvth_n_f16_s16_1:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sxth w8, w0
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: scvtf h0, h0, #1
+; CHECK-GI-NEXT: ret
entry:
%sext = sext i16 %a to i32
%fcvth_n = tail call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i32(i32 %sext, i32 1)
@@ -206,18 +206,18 @@ entry:
}
define dso_local half @test_vcvth_n_f16_s16_16(i16 %a) {
-; SDISEL-LABEL: test_vcvth_n_f16_s16_16:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: fmov s0, w0
-; SDISEL-NEXT: scvtf h0, h0, #16
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: test_vcvth_n_f16_s16_16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov s0, w0
+; CHECK-SD-NEXT: scvtf h0, h0, #16
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: test_vcvth_n_f16_s16_16:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: sxth w8, w0
-; GISEL-NEXT: fmov s0, w8
-; GISEL-NEXT: scvtf h0, h0, #16
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: test_vcvth_n_f16_s16_16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sxth w8, w0
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: scvtf h0, h0, #16
+; CHECK-GI-NEXT: ret
entry:
%sext = sext i16 %a to i32
%fcvth_n = tail call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i32(i32 %sext, i32 16)
@@ -315,18 +315,18 @@ entry:
}
define dso_local half @test_vcvth_n_f16_u16_1(i16 %a) {
-; SDISEL-LABEL: test_vcvth_n_f16_u16_1:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: fmov s0, w0
-; SDISEL-NEXT: ucvtf h0, h0, #1
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: test_vcvth_n_f16_u16_1:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov s0, w0
+; CHECK-SD-NEXT: ucvtf h0, h0, #1
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: test_vcvth_n_f16_u16_1:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: and w8, w0, #0xffff
-; GISEL-NEXT: fmov s0, w8
-; GISEL-NEXT: ucvtf h0, h0, #1
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: test_vcvth_n_f16_u16_1:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: and w8, w0, #0xffff
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: ucvtf h0, h0, #1
+; CHECK-GI-NEXT: ret
entry:
%0 = zext i16 %a to i32
%fcvth_n = tail call half @llvm.aarch64.neon.vcvtfxu2fp.f16.i32(i32 %0, i32 1)
@@ -334,18 +334,18 @@ entry:
}
define dso_local half @test_vcvth_n_f16_u16_16(i16 %a) {
-; SDISEL-LABEL: test_vcvth_n_f16_u16_16:
-; SDISEL: // %bb.0: // %entry
-; SDISEL-NEXT: fmov s0, w0
-; SDISEL-NEXT: ucvtf h0, h0, #16
-; SDISEL-NEXT: ret
+; CHECK-SD-LABEL: test_vcvth_n_f16_u16_16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov s0, w0
+; CHECK-SD-NEXT: ucvtf h0, h0, #16
+; CHECK-SD-NEXT: ret
;
-; GISEL-LABEL: test_vcvth_n_f16_u16_16:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: and w8, w0, #0xffff
-; GISEL-NEXT: fmov s0, w8
-; GISEL-NEXT: ucvtf h0, h0, #16
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: test_vcvth_n_f16_u16_16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: and w8, w0, #0xffff
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: ucvtf h0, h0, #16
+; CHECK-GI-NEXT: ret
entry:
%0 = zext i16 %a to i32
%fcvth_n = tail call half @llvm.aarch64.neon.vcvtfxu2fp.f16.i32(i32 %0, i32 16)
diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll
index ae2ef26..4c28c90 100644
--- a/llvm/test/CodeGen/AArch64/fsh.ll
+++ b/llvm/test/CodeGen/AArch64/fsh.ll
@@ -2509,88 +2509,87 @@ define <7 x i32> @fshl_v7i32(<7 x i32> %a, <7 x i32> %b, <7 x i32> %c) {
;
; CHECK-GI-LABEL: fshl_v7i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr s17, [sp, #48]
-; CHECK-GI-NEXT: add x8, sp, #56
-; CHECK-GI-NEXT: add x9, sp, #64
+; CHECK-GI-NEXT: ldr s3, [sp, #48]
+; CHECK-GI-NEXT: ldr s20, [sp, #56]
+; CHECK-GI-NEXT: add x9, sp, #56
; CHECK-GI-NEXT: ldr s4, [sp, #48]
-; CHECK-GI-NEXT: ldr s21, [sp, #56]
-; CHECK-GI-NEXT: mov w10, #-1 // =0xffffffff
-; CHECK-GI-NEXT: ld1 { v17.s }[1], [x8]
-; CHECK-GI-NEXT: ldr s20, [x9]
-; CHECK-GI-NEXT: add x8, sp, #72
-; CHECK-GI-NEXT: mov v4.s[1], v21.s[0]
+; CHECK-GI-NEXT: ldr s7, [sp, #80]
+; CHECK-GI-NEXT: mov w12, #-1 // =0xffffffff
+; CHECK-GI-NEXT: ldr s21, [sp, #88]
+; CHECK-GI-NEXT: mov v3.s[1], v20.s[0]
+; CHECK-GI-NEXT: fmov s20, w12
+; CHECK-GI-NEXT: ld1 { v4.s }[1], [x9]
+; CHECK-GI-NEXT: ldr s17, [sp]
+; CHECK-GI-NEXT: add x13, sp, #64
+; CHECK-GI-NEXT: mov v7.s[1], v21.s[0]
; CHECK-GI-NEXT: fmov s21, w7
-; CHECK-GI-NEXT: ldr s6, [sp]
-; CHECK-GI-NEXT: ld1 { v20.s }[1], [x8]
; CHECK-GI-NEXT: ldr s19, [sp, #64]
-; CHECK-GI-NEXT: ldr s7, [sp, #80]
-; CHECK-GI-NEXT: ldr s22, [sp, #88]
-; CHECK-GI-NEXT: mov w9, #31 // =0x1f
-; CHECK-GI-NEXT: mov w11, #1 // =0x1
-; CHECK-GI-NEXT: mov v21.s[1], v6.s[0]
-; CHECK-GI-NEXT: fmov s6, w9
+; CHECK-GI-NEXT: mov w11, #31 // =0x1f
+; CHECK-GI-NEXT: mov v20.s[1], w12
; CHECK-GI-NEXT: ldr s18, [sp, #96]
-; CHECK-GI-NEXT: zip1 v17.2d, v17.2d, v20.2d
-; CHECK-GI-NEXT: fmov s20, w10
-; CHECK-GI-NEXT: mov v7.s[1], v22.s[0]
-; CHECK-GI-NEXT: mov v4.s[2], v19.s[0]
-; CHECK-GI-NEXT: fmov s19, w11
+; CHECK-GI-NEXT: ld1 { v4.s }[2], [x13]
+; CHECK-GI-NEXT: mov w13, #1 // =0x1
+; CHECK-GI-NEXT: mov v3.s[2], v19.s[0]
+; CHECK-GI-NEXT: mov v21.s[1], v17.s[0]
+; CHECK-GI-NEXT: fmov s17, w11
+; CHECK-GI-NEXT: fmov s19, w13
; CHECK-GI-NEXT: fmov s23, w0
-; CHECK-GI-NEXT: mov v6.s[1], w9
-; CHECK-GI-NEXT: fmov s24, w9
-; CHECK-GI-NEXT: ldr s2, [sp, #8]
-; CHECK-GI-NEXT: mov v20.s[1], w10
+; CHECK-GI-NEXT: fmov s24, w11
+; CHECK-GI-NEXT: ldr s6, [sp, #8]
; CHECK-GI-NEXT: ldr s0, [sp, #24]
; CHECK-GI-NEXT: ldr s5, [sp, #32]
-; CHECK-GI-NEXT: mov v19.s[1], w11
; CHECK-GI-NEXT: mov v7.s[2], v18.s[0]
+; CHECK-GI-NEXT: mov v17.s[1], w11
+; CHECK-GI-NEXT: mov v19.s[1], w13
+; CHECK-GI-NEXT: mov v20.s[2], w12
; CHECK-GI-NEXT: ldr s16, [sp, #72]
; CHECK-GI-NEXT: mov v23.s[1], w1
; CHECK-GI-NEXT: ldr s18, [sp, #80]
-; CHECK-GI-NEXT: mov v21.s[2], v2.s[0]
-; CHECK-GI-NEXT: mov v24.s[1], w9
+; CHECK-GI-NEXT: mov v21.s[2], v6.s[0]
+; CHECK-GI-NEXT: mov v24.s[1], w11
; CHECK-GI-NEXT: mov v0.s[1], v5.s[0]
-; CHECK-GI-NEXT: fmov s5, w4
-; CHECK-GI-NEXT: mov v20.s[2], w10
-; CHECK-GI-NEXT: add x8, sp, #88
+; CHECK-GI-NEXT: fmov s6, w4
+; CHECK-GI-NEXT: add x10, sp, #88
; CHECK-GI-NEXT: movi v22.4s, #31
-; CHECK-GI-NEXT: mov v4.s[3], v16.s[0]
-; CHECK-GI-NEXT: mov v6.s[2], w9
-; CHECK-GI-NEXT: mov v19.s[2], w11
-; CHECK-GI-NEXT: ldr s1, [sp, #16]
-; CHECK-GI-NEXT: ldr s3, [sp, #40]
-; CHECK-GI-NEXT: ld1 { v18.s }[1], [x8]
+; CHECK-GI-NEXT: mov v3.s[3], v16.s[0]
+; CHECK-GI-NEXT: mov v17.s[2], w11
+; CHECK-GI-NEXT: mov v19.s[2], w13
+; CHECK-GI-NEXT: ldr s2, [sp, #16]
+; CHECK-GI-NEXT: ldr s1, [sp, #40]
+; CHECK-GI-NEXT: ld1 { v18.s }[1], [x10]
+; CHECK-GI-NEXT: eor v5.16b, v7.16b, v20.16b
; CHECK-GI-NEXT: mov v23.s[2], w2
-; CHECK-GI-NEXT: mov v5.s[1], w5
-; CHECK-GI-NEXT: add x8, sp, #96
-; CHECK-GI-NEXT: eor v2.16b, v7.16b, v20.16b
-; CHECK-GI-NEXT: mov v21.s[3], v1.s[0]
-; CHECK-GI-NEXT: mov v24.s[2], w9
-; CHECK-GI-NEXT: mov v0.s[2], v3.s[0]
-; CHECK-GI-NEXT: bic v1.16b, v22.16b, v4.16b
-; CHECK-GI-NEXT: ld1 { v18.s }[2], [x8]
+; CHECK-GI-NEXT: mov v6.s[1], w5
+; CHECK-GI-NEXT: add x8, sp, #72
+; CHECK-GI-NEXT: add x9, sp, #96
+; CHECK-GI-NEXT: mov v21.s[3], v2.s[0]
+; CHECK-GI-NEXT: mov v24.s[2], w11
+; CHECK-GI-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT: ld1 { v4.s }[3], [x8]
+; CHECK-GI-NEXT: bic v2.16b, v22.16b, v3.16b
+; CHECK-GI-NEXT: ld1 { v18.s }[2], [x9]
+; CHECK-GI-NEXT: and v1.16b, v5.16b, v17.16b
; CHECK-GI-NEXT: neg v3.4s, v19.4s
-; CHECK-GI-NEXT: and v4.16b, v17.16b, v22.16b
-; CHECK-GI-NEXT: and v2.16b, v2.16b, v6.16b
; CHECK-GI-NEXT: mov v23.s[3], w3
-; CHECK-GI-NEXT: mov v5.s[2], w6
-; CHECK-GI-NEXT: ushr v6.4s, v21.4s, #1
-; CHECK-GI-NEXT: neg v1.4s, v1.4s
+; CHECK-GI-NEXT: mov v6.s[2], w6
+; CHECK-GI-NEXT: and v4.16b, v4.16b, v22.16b
+; CHECK-GI-NEXT: ushr v5.4s, v21.4s, #1
+; CHECK-GI-NEXT: neg v2.4s, v2.4s
; CHECK-GI-NEXT: and v7.16b, v18.16b, v24.16b
+; CHECK-GI-NEXT: neg v1.4s, v1.4s
; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v3.4s
-; CHECK-GI-NEXT: neg v2.4s, v2.4s
; CHECK-GI-NEXT: ushl v3.4s, v23.4s, v4.4s
-; CHECK-GI-NEXT: ushl v1.4s, v6.4s, v1.4s
-; CHECK-GI-NEXT: ushl v4.4s, v5.4s, v7.4s
-; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v2.4s
-; CHECK-GI-NEXT: orr v1.16b, v3.16b, v1.16b
+; CHECK-GI-NEXT: ushl v2.4s, v5.4s, v2.4s
+; CHECK-GI-NEXT: ushl v4.4s, v6.4s, v7.4s
+; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: orr v1.16b, v3.16b, v2.16b
; CHECK-GI-NEXT: orr v0.16b, v4.16b, v0.16b
; CHECK-GI-NEXT: mov s2, v1.s[1]
; CHECK-GI-NEXT: mov s3, v1.s[2]
; CHECK-GI-NEXT: mov s4, v1.s[3]
-; CHECK-GI-NEXT: fmov w0, s1
; CHECK-GI-NEXT: mov s5, v0.s[1]
; CHECK-GI-NEXT: mov s6, v0.s[2]
+; CHECK-GI-NEXT: fmov w0, s1
; CHECK-GI-NEXT: fmov w4, s0
; CHECK-GI-NEXT: fmov w1, s2
; CHECK-GI-NEXT: fmov w2, s3
diff --git a/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll b/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll
new file mode 100644
index 0000000..c4a027c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll
@@ -0,0 +1,162 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -tail-dup-pred-size=2 -tail-dup-succ-size=2 -o - %s | FileCheck %s
+
+target triple = "arm64-apple-macosx13.0.0"
+
+@opcode.targets = local_unnamed_addr constant [6 x ptr] [ptr blockaddress(@test_interp, %op1.bb), ptr blockaddress(@test_interp, %op6.bb), ptr blockaddress(@test_interp, %loop.header), ptr blockaddress(@test_interp, %op2.bb), ptr blockaddress(@test_interp, %op4.bb), ptr blockaddress(@test_interp, %op5.bb)]
+
+define void @test_interp(ptr %frame, ptr %dst) {
+; CHECK-LABEL: test_interp:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: stp x24, x23, [sp, #-64]! ; 16-byte Folded Spill
+; CHECK-NEXT: stp x22, x21, [sp, #16] ; 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #32] ; 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #48] ; 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_offset w19, -24
+; CHECK-NEXT: .cfi_offset w20, -32
+; CHECK-NEXT: .cfi_offset w21, -40
+; CHECK-NEXT: .cfi_offset w22, -48
+; CHECK-NEXT: .cfi_offset w23, -56
+; CHECK-NEXT: .cfi_offset w24, -64
+; CHECK-NEXT: Lloh0:
+; CHECK-NEXT: adrp x21, _opcode.targets@PAGE
+; CHECK-NEXT: Lloh1:
+; CHECK-NEXT: add x21, x21, _opcode.targets@PAGEOFF
+; CHECK-NEXT: mov x22, xzr
+; CHECK-NEXT: add x8, x21, xzr, lsl #3
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: mov x20, x0
+; CHECK-NEXT: add x23, x22, #1
+; CHECK-NEXT: br x8
+; CHECK-NEXT: Ltmp0: ; Block address taken
+; CHECK-NEXT: LBB0_1: ; %loop.header
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add x8, x21, x23, lsl #3
+; CHECK-NEXT: mov x20, xzr
+; CHECK-NEXT: mov x22, xzr
+; CHECK-NEXT: add x23, x23, #1
+; CHECK-NEXT: br x8
+; CHECK-NEXT: Ltmp1: ; Block address taken
+; CHECK-NEXT: LBB0_2: ; %op1.bb
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: str xzr, [x19]
+; CHECK-NEXT: mov w8, #1 ; =0x1
+; CHECK-NEXT: ldr x0, [x20, #-8]!
+; CHECK-NEXT: ldr x9, [x0, #8]
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: ldr x8, [x9, #48]
+; CHECK-NEXT: blr x8
+; CHECK-NEXT: add x8, x21, x23, lsl #3
+; CHECK-NEXT: add x23, x23, #1
+; CHECK-NEXT: br x8
+; CHECK-NEXT: Ltmp2: ; Block address taken
+; CHECK-NEXT: LBB0_3: ; %op2.bb
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add x8, x21, x23, lsl #3
+; CHECK-NEXT: mov x20, xzr
+; CHECK-NEXT: add x23, x23, #1
+; CHECK-NEXT: str x22, [x19]
+; CHECK-NEXT: mov x22, xzr
+; CHECK-NEXT: br x8
+; CHECK-NEXT: Ltmp3: ; Block address taken
+; CHECK-NEXT: LBB0_4: ; %op4.bb
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: str x22, [x19]
+; CHECK-NEXT: add x10, x21, x23, lsl #3
+; CHECK-NEXT: add x23, x23, #1
+; CHECK-NEXT: ldur x8, [x22, #12]
+; CHECK-NEXT: ldur x9, [x20, #-8]
+; CHECK-NEXT: add x22, x22, #20
+; CHECK-NEXT: stp x8, x9, [x20, #-8]
+; CHECK-NEXT: add x20, x20, #8
+; CHECK-NEXT: br x10
+; CHECK-NEXT: Ltmp4: ; Block address taken
+; CHECK-NEXT: LBB0_5: ; %op5.bb
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: str x22, [x19]
+; CHECK-NEXT: add x10, x21, x23, lsl #3
+; CHECK-NEXT: add x23, x23, #1
+; CHECK-NEXT: ldur x8, [x22, #12]
+; CHECK-NEXT: ldur x9, [x20, #-8]
+; CHECK-NEXT: add x22, x22, #20
+; CHECK-NEXT: stp x8, x9, [x20, #-8]
+; CHECK-NEXT: add x20, x20, #8
+; CHECK-NEXT: br x10
+; CHECK-NEXT: Ltmp5: ; Block address taken
+; CHECK-NEXT: LBB0_6: ; %op6.bb
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr x0, [x20, #-8]!
+; CHECK-NEXT: mov w8, #1 ; =0x1
+; CHECK-NEXT: ldr x9, [x0, #8]
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: ldr x8, [x9, #48]
+; CHECK-NEXT: blr x8
+; CHECK-NEXT: add x8, x21, x23, lsl #3
+; CHECK-NEXT: add x23, x23, #1
+; CHECK-NEXT: br x8
+; CHECK-NEXT: .loh AdrpAdd Lloh0, Lloh1
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %op1.bb ], [ %iv.next, %op2.bb ], [ %iv.next, %op4.bb ], [ %iv.next, %op5.bb ], [ %iv.next, %op6.bb ], [ %iv.next, %loop.header ]
+ %stack.pointer = phi ptr [ %frame, %entry ], [ %stack.8, %op1.bb ], [ null, %op2.bb ], [ %stack.next, %op4.bb ], [ %stack.next.2, %op5.bb ], [ %stack.4, %op6.bb ], [ null, %loop.header ]
+ %next.instr = phi ptr [ null, %entry ], [ %next.instr, %op1.bb ], [ null, %op2.bb ], [ %next.instr.20, %op4.bb ], [ %next.instr.21, %op5.bb ], [ %next.instr, %op6.bb ], [ null, %loop.header ]
+ %iv.next = add i64 %iv, 1
+ %next_op = getelementptr [6 x ptr], ptr @opcode.targets, i64 0, i64 %iv
+ indirectbr ptr %next_op, [label %op1.bb, label %op6.bb, label %loop.header, label %op2.bb, label %op4.bb, label %op5.bb]
+
+op1.bb:
+ store ptr null, ptr %dst, align 8
+ %stack.8 = getelementptr i8, ptr %stack.pointer, i64 -8
+ %l.0 = load ptr, ptr %stack.8, align 8
+ store i64 1, ptr %l.0, align 8
+ %gep.0 = getelementptr i8, ptr %l.0, i64 8
+ %l.1 = load ptr, ptr %gep.0, align 8
+ %gep.1 = getelementptr i8, ptr %l.1, i64 48
+ %l.2 = load ptr, ptr %gep.1, align 8
+ tail call void %l.2(ptr nonnull %l.0)
+ br label %loop.header
+
+op2.bb:
+ store ptr %next.instr, ptr %dst, align 8
+ br label %loop.header
+
+op4.bb:
+ store ptr %next.instr, ptr %dst, align 8
+ %next.instr.20 = getelementptr i8, ptr %next.instr, i64 20
+ %stack.2 = getelementptr i8, ptr %stack.pointer, i64 -8
+ %l.3 = load ptr, ptr %stack.2, align 8
+ %next.instr.12 = getelementptr i8, ptr %next.instr, i64 12
+ %next.instr.12.val = load ptr, ptr %next.instr.12, align 2
+ store ptr %next.instr.12.val, ptr %stack.2, align 8
+ store ptr %l.3, ptr %stack.pointer, align 8
+ %stack.next = getelementptr i8, ptr %stack.pointer, i64 8
+ br label %loop.header
+
+op5.bb:
+ store ptr %next.instr, ptr %dst, align 8
+ %next.instr.21 = getelementptr i8, ptr %next.instr, i64 20
+ %stack.3 = getelementptr i8, ptr %stack.pointer, i64 -8
+ %l.4 = load ptr, ptr %stack.3, align 8
+ %next.instr.2 = getelementptr i8, ptr %next.instr, i64 12
+ %next.instr.2.val = load ptr, ptr %next.instr.2, align 2
+ store ptr %next.instr.2.val, ptr %stack.3, align 8
+ store ptr %l.4, ptr %stack.pointer, align 8
+ %stack.next.2 = getelementptr i8, ptr %stack.pointer, i64 8
+ br label %loop.header
+
+op6.bb:
+ %stack.4 = getelementptr i8, ptr %stack.pointer, i64 -8
+ %l.5 = load ptr, ptr %stack.4, align 8
+ store i64 1, ptr %l.5, align 8
+ %gep.5 = getelementptr i8, ptr %l.5, i64 8
+ %l.6 = load ptr, ptr %gep.5, align 8
+ %gep.6 = getelementptr i8, ptr %l.6, i64 48
+ %l.7 = load ptr, ptr %gep.6, align 8
+ tail call void %l.7(ptr nonnull %l.5)
+ br label %loop.header
+}
diff --git a/llvm/test/CodeGen/AArch64/llvm.frexp.ll b/llvm/test/CodeGen/AArch64/llvm.frexp.ll
index 4e1876d..2213aa1 100644
--- a/llvm/test/CodeGen/AArch64/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AArch64/llvm.frexp.ll
@@ -700,14 +700,13 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
; CHECK-NEXT: ldr s1, [sp, #44]
; CHECK-NEXT: ldr q2, [sp] // 16-byte Folded Reload
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT: mov v2.s[3], v0.s[0]
; CHECK-NEXT: ld1 { v1.s }[1], [x19]
-; CHECK-NEXT: ldr s0, [x20]
-; CHECK-NEXT: ld1 { v0.s }[1], [x21]
+; CHECK-NEXT: mov v2.s[3], v0.s[0]
+; CHECK-NEXT: ld1 { v1.s }[2], [x20]
; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: ldp x30, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: zip1 v1.2d, v1.2d, v0.2d
; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ld1 { v1.s }[3], [x21]
+; CHECK-NEXT: ldp x30, x21, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #80
; CHECK-NEXT: ret
;
@@ -873,11 +872,10 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
; CHECK-NEXT: bl frexpf
; CHECK-NEXT: ldr s0, [sp, #28]
; CHECK-NEXT: ld1 { v0.s }[1], [x19]
-; CHECK-NEXT: ldr s1, [x20]
-; CHECK-NEXT: ld1 { v1.s }[1], [x21]
+; CHECK-NEXT: ld1 { v0.s }[2], [x20]
; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ld1 { v0.s }[3], [x21]
; CHECK-NEXT: ldp x30, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
; CHECK-NEXT: add sp, sp, #64
; CHECK-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-reassociate.mir b/llvm/test/CodeGen/AArch64/machine-combiner-reassociate.mir
index 525f6dd..184c9ef 100644
--- a/llvm/test/CodeGen/AArch64/machine-combiner-reassociate.mir
+++ b/llvm/test/CodeGen/AArch64/machine-combiner-reassociate.mir
@@ -1,14 +1,11 @@
-# RUN: llc -run-pass=machine-combiner -mtriple=aarch64-unknown-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SAFE
-# RUN: llc -run-pass=machine-combiner -mtriple=aarch64-unknown-linux-gnu -enable-unsafe-fp-math %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-UNSAFE
+# RUN: llc -run-pass=machine-combiner -mtriple=aarch64-unknown-linux-gnu %s -o - | FileCheck %s
# fadd without the reassoc flags can be reassociated only when unsafe fp math is
# enabled.
# CHECK-LABEL: name: fadd_no_reassoc
# CHECK: [[ADD1:%[0-9]+]]:fpr32 = FADDSrr %0, %1, implicit $fpcr
-# CHECK-SAFE-NEXT: [[ADD2:%[0-9]+]]:fpr32 = FADDSrr killed [[ADD1]], %2, implicit $fpcr
-# CHECK-SAFE-NEXT: [[ADD3:%[0-9]+]]:fpr32 = FADDSrr killed [[ADD2]], %3, implicit $fpcr
-# CHECK-UNSAFE-NEXT: [[ADD2:%[0-9]+]]:fpr32 = FADDSrr %2, %3, implicit $fpcr
-# CHECK-UNSAFE-NEXT: [[ADD3:%[0-9]+]]:fpr32 = FADDSrr killed [[ADD1]], killed [[ADD2]], implicit $fpcr
+# CHECK: [[ADD2:%[0-9]+]]:fpr32 = FADDSrr killed [[ADD1]], %2, implicit $fpcr
+# CHECK: [[ADD3:%[0-9]+]]:fpr32 = FADDSrr killed [[ADD2]], %3, implicit $fpcr
---
name: fadd_no_reassoc
alignment: 4
@@ -49,10 +46,9 @@ body: |
# the reassoc flag is ignored.
# CHECK-LABEL: name: fadd_reassoc
# CHECK: [[ADD1:%[0-9]+]]:fpr32 = reassoc FADDSrr %0, %1, implicit $fpcr
-# CHECK-SAFE-NEXT: [[ADD2:%[0-9]+]]:fpr32 = reassoc FADDSrr killed [[ADD1]], %2, implicit $fpcr
-# CHECK-SAFE-NEXT: [[ADD3:%[0-9]+]]:fpr32 = reassoc FADDSrr killed [[ADD2]], %3, implicit $fpcr
-# CHECK-UNSAFE-NEXT: [[ADD2:%[0-9]+]]:fpr32 = reassoc FADDSrr %2, %3, implicit $fpcr
-# CHECK-UNSAFE-NEXT: [[ADD3:%[0-9]+]]:fpr32 = reassoc FADDSrr killed [[ADD1]], killed [[ADD2]], implicit $fpcr
+# CHECK: [[ADD2:%[0-9]+]]:fpr32 = reassoc FADDSrr killed [[ADD1]], %2, implicit $fpcr
+# CHECK: [[ADD3:%[0-9]+]]:fpr32 = reassoc FADDSrr killed [[ADD2]], %3, implicit $fpcr
+
---
name: fadd_reassoc
alignment: 4
@@ -92,10 +88,8 @@ body: |
# Check that flags on the instructions are preserved after reassociation.
# CHECK-LABEL: name: fadd_flags
# CHECK: [[ADD1:%[0-9]+]]:fpr32 = nnan ninf nsz FADDSrr %0, %1, implicit $fpcr
-# CHECK-SAFE-NEXT: [[ADD2:%[0-9]+]]:fpr32 = nnan nsz FADDSrr killed [[ADD1]], %2, implicit $fpcr
-# CHECK-SAFE-NEXT: [[ADD3:%[0-9]+]]:fpr32 = ninf nsz FADDSrr killed [[ADD2]], %3, implicit $fpcr
-# CHECK-UNSAFE-NEXT: [[ADD2:%[0-9]+]]:fpr32 = nsz FADDSrr %2, %3, implicit $fpcr
-# CHECK-UNSAFE-NEXT: [[ADD3:%[0-9]+]]:fpr32 = nsz FADDSrr killed [[ADD1]], killed [[ADD2]], implicit $fpcr
+# CHECK: [[ADD2:%[0-9]+]]:fpr32 = nnan nsz FADDSrr killed [[ADD1]], %2, implicit $fpcr
+# CHECK: [[ADD3:%[0-9]+]]:fpr32 = ninf nsz FADDSrr killed [[ADD2]], %3, implicit $fpcr
---
name: fadd_flags
alignment: 4
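An IR-level sketch of the flag handling the updated CHECK lines describe (illustrative names; assumes the machine combiner's usual flag-intersection behavior): when a chain is rebalanced, the replacement adds may keep only the flags common to the instructions they replace.

define float @fadd_flags_sketch(float %x0, float %x1, float %x2, float %x3) {
  %t0 = fadd nnan ninf nsz float %x0, %x1
  %t1 = fadd reassoc nnan nsz float %t0, %x2
  %t2 = fadd reassoc ninf nsz float %t1, %x3
  ; If %t1/%t2 were rebalanced into (%x2 + %x3) + %t0, both new fadds could
  ; carry only the shared 'reassoc' and 'nsz' flags, not 'nnan' or 'ninf'.
  ret float %t2
}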
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner.ll b/llvm/test/CodeGen/AArch64/machine-combiner.ll
index ec61fee..65afd92 100644
--- a/llvm/test/CodeGen/AArch64/machine-combiner.ll
+++ b/llvm/test/CodeGen/AArch64/machine-combiner.ll
@@ -1,29 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=neoverse-n2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-STD
-; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=neoverse-n2 -enable-unsafe-fp-math < %s | FileCheck %s --check-prefixes=CHECK,CHECK-UNSAFE
+; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=neoverse-n2 < %s | FileCheck %s
; Incremental updates of the instruction depths should be enough for this test
; case.
-; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=neoverse-n2 -enable-unsafe-fp-math \
-; RUN: -machine-combiner-inc-threshold=0 -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefixes=CHECK,CHECK-UNSAFE
+; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=neoverse-n2 \
+; RUN: -machine-combiner-inc-threshold=0 -machine-combiner-verify-pattern-order=true < %s | FileCheck %s
; Verify that the first two adds are independent regardless of how the inputs are
; commuted. The destination registers are used as source registers for the third add.
define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-STD-LABEL: reassociate_adds1:
-; CHECK-STD: // %bb.0:
-; CHECK-STD-NEXT: fadd s0, s0, s1
-; CHECK-STD-NEXT: fadd s0, s0, s2
-; CHECK-STD-NEXT: fadd s0, s0, s3
-; CHECK-STD-NEXT: ret
-;
-; CHECK-UNSAFE-LABEL: reassociate_adds1:
-; CHECK-UNSAFE: // %bb.0:
-; CHECK-UNSAFE-NEXT: fadd s0, s0, s1
-; CHECK-UNSAFE-NEXT: fadd s1, s2, s3
-; CHECK-UNSAFE-NEXT: fadd s0, s0, s1
-; CHECK-UNSAFE-NEXT: ret
+; CHECK-LABEL: reassociate_adds1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fadd s0, s0, s2
+; CHECK-NEXT: fadd s0, s0, s3
+; CHECK-NEXT: ret
%t0 = fadd float %x0, %x1
%t1 = fadd float %t0, %x2
%t2 = fadd float %t1, %x3
@@ -44,110 +36,110 @@ define float @reassociate_adds1_fast(float %x0, float %x1, float %x2, float %x3)
}
define float @reassociate_adds1_reassoc(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-STD-LABEL: reassociate_adds1_reassoc:
-; CHECK-STD: // %bb.0:
-; CHECK-STD-NEXT: fadd s0, s0, s1
-; CHECK-STD-NEXT: fadd s0, s0, s2
-; CHECK-STD-NEXT: fadd s0, s0, s3
-; CHECK-STD-NEXT: ret
-;
-; CHECK-UNSAFE-LABEL: reassociate_adds1_reassoc:
-; CHECK-UNSAFE: // %bb.0:
-; CHECK-UNSAFE-NEXT: fadd s0, s0, s1
-; CHECK-UNSAFE-NEXT: fadd s1, s2, s3
-; CHECK-UNSAFE-NEXT: fadd s0, s0, s1
-; CHECK-UNSAFE-NEXT: ret
- %t0 = fadd reassoc float %x0, %x1
- %t1 = fadd reassoc float %t0, %x2
- %t2 = fadd reassoc float %t1, %x3
+; CHECK-LABEL: reassociate_adds1_reassoc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fadd s1, s2, s3
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
+ %t0 = fadd reassoc nsz float %x0, %x1
+ %t1 = fadd reassoc nsz float %t0, %x2
+ %t2 = fadd reassoc nsz float %t1, %x3
ret float %t2
}
define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-STD-LABEL: reassociate_adds2:
-; CHECK-STD: // %bb.0:
-; CHECK-STD-NEXT: fadd s0, s0, s1
-; CHECK-STD-NEXT: fadd s0, s2, s0
-; CHECK-STD-NEXT: fadd s0, s0, s3
-; CHECK-STD-NEXT: ret
-;
-; CHECK-UNSAFE-LABEL: reassociate_adds2:
-; CHECK-UNSAFE: // %bb.0:
-; CHECK-UNSAFE-NEXT: fadd s0, s0, s1
-; CHECK-UNSAFE-NEXT: fadd s1, s2, s3
-; CHECK-UNSAFE-NEXT: fadd s0, s1, s0
-; CHECK-UNSAFE-NEXT: ret
+; CHECK-LABEL: reassociate_adds2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fadd s0, s2, s0
+; CHECK-NEXT: fadd s0, s0, s3
+; CHECK-NEXT: ret
%t0 = fadd float %x0, %x1
%t1 = fadd float %x2, %t0
%t2 = fadd float %t1, %x3
ret float %t2
}
+define float @reassociate_adds2_reassoc(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds2_reassoc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fadd s1, s2, s3
+; CHECK-NEXT: fadd s0, s1, s0
+; CHECK-NEXT: ret
+ %t0 = fadd reassoc nsz float %x0, %x1
+ %t1 = fadd reassoc nsz float %x2, %t0
+ %t2 = fadd reassoc nsz float %t1, %x3
+ ret float %t2
+}
+
define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-STD-LABEL: reassociate_adds3:
-; CHECK-STD: // %bb.0:
-; CHECK-STD-NEXT: fadd s0, s0, s1
-; CHECK-STD-NEXT: fadd s0, s0, s2
-; CHECK-STD-NEXT: fadd s0, s3, s0
-; CHECK-STD-NEXT: ret
-;
-; CHECK-UNSAFE-LABEL: reassociate_adds3:
-; CHECK-UNSAFE: // %bb.0:
-; CHECK-UNSAFE-NEXT: fadd s0, s0, s1
-; CHECK-UNSAFE-NEXT: fadd s1, s3, s2
-; CHECK-UNSAFE-NEXT: fadd s0, s1, s0
-; CHECK-UNSAFE-NEXT: ret
+; CHECK-LABEL: reassociate_adds3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fadd s0, s0, s2
+; CHECK-NEXT: fadd s0, s3, s0
+; CHECK-NEXT: ret
%t0 = fadd float %x0, %x1
%t1 = fadd float %t0, %x2
%t2 = fadd float %x3, %t1
ret float %t2
}
+define float @reassociate_adds3_reassoc(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds3_reassoc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fadd s1, s3, s2
+; CHECK-NEXT: fadd s0, s1, s0
+; CHECK-NEXT: ret
+ %t0 = fadd reassoc nsz float %x0, %x1
+ %t1 = fadd reassoc nsz float %t0, %x2
+ %t2 = fadd reassoc nsz float %x3, %t1
+ ret float %t2
+}
+
define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-STD-LABEL: reassociate_adds4:
-; CHECK-STD: // %bb.0:
-; CHECK-STD-NEXT: fadd s0, s0, s1
-; CHECK-STD-NEXT: fadd s0, s2, s0
-; CHECK-STD-NEXT: fadd s0, s3, s0
-; CHECK-STD-NEXT: ret
-;
-; CHECK-UNSAFE-LABEL: reassociate_adds4:
-; CHECK-UNSAFE: // %bb.0:
-; CHECK-UNSAFE-NEXT: fadd s0, s0, s1
-; CHECK-UNSAFE-NEXT: fadd s1, s3, s2
-; CHECK-UNSAFE-NEXT: fadd s0, s1, s0
-; CHECK-UNSAFE-NEXT: ret
+; CHECK-LABEL: reassociate_adds4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fadd s0, s2, s0
+; CHECK-NEXT: fadd s0, s3, s0
+; CHECK-NEXT: ret
%t0 = fadd float %x0, %x1
%t1 = fadd float %x2, %t0
%t2 = fadd float %x3, %t1
ret float %t2
}
+define float @reassociate_adds4_reassoc(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds4_reassoc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fadd s1, s3, s2
+; CHECK-NEXT: fadd s0, s1, s0
+; CHECK-NEXT: ret
+ %t0 = fadd reassoc nsz float %x0, %x1
+ %t1 = fadd reassoc nsz float %x2, %t0
+ %t2 = fadd reassoc nsz float %x3, %t1
+ ret float %t2
+}
+
; Verify that we reassociate some of these ops. The optimal balanced tree of adds is not
; produced because that would cost more compile time.
define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) {
-; CHECK-STD-LABEL: reassociate_adds5:
-; CHECK-STD: // %bb.0:
-; CHECK-STD-NEXT: fadd s0, s0, s1
-; CHECK-STD-NEXT: fadd s0, s0, s2
-; CHECK-STD-NEXT: fadd s0, s0, s3
-; CHECK-STD-NEXT: fadd s0, s0, s4
-; CHECK-STD-NEXT: fadd s0, s0, s5
-; CHECK-STD-NEXT: fadd s0, s0, s6
-; CHECK-STD-NEXT: fadd s0, s0, s7
-; CHECK-STD-NEXT: ret
-;
-; CHECK-UNSAFE-LABEL: reassociate_adds5:
-; CHECK-UNSAFE: // %bb.0:
-; CHECK-UNSAFE-NEXT: fadd s0, s0, s1
-; CHECK-UNSAFE-NEXT: fadd s1, s2, s3
-; CHECK-UNSAFE-NEXT: fadd s0, s0, s1
-; CHECK-UNSAFE-NEXT: fadd s1, s4, s5
-; CHECK-UNSAFE-NEXT: fadd s1, s1, s6
-; CHECK-UNSAFE-NEXT: fadd s0, s0, s1
-; CHECK-UNSAFE-NEXT: fadd s0, s0, s7
-; CHECK-UNSAFE-NEXT: ret
+; CHECK-LABEL: reassociate_adds5:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fadd s0, s0, s2
+; CHECK-NEXT: fadd s0, s0, s3
+; CHECK-NEXT: fadd s0, s0, s4
+; CHECK-NEXT: fadd s0, s0, s5
+; CHECK-NEXT: fadd s0, s0, s6
+; CHECK-NEXT: fadd s0, s0, s7
+; CHECK-NEXT: ret
%t0 = fadd float %x0, %x1
%t1 = fadd float %t0, %x2
%t2 = fadd float %t1, %x3
@@ -158,141 +150,198 @@ define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, floa
ret float %t6
}
+define float @reassociate_adds5_reassoc(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) {
+; CHECK-LABEL: reassociate_adds5_reassoc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fadd s1, s2, s3
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fadd s1, s4, s5
+; CHECK-NEXT: fadd s1, s1, s6
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fadd s0, s0, s7
+; CHECK-NEXT: ret
+ %t0 = fadd reassoc nsz float %x0, %x1
+ %t1 = fadd reassoc nsz float %t0, %x2
+ %t2 = fadd reassoc nsz float %t1, %x3
+ %t3 = fadd reassoc nsz float %t2, %x4
+ %t4 = fadd reassoc nsz float %t3, %x5
+ %t5 = fadd reassoc nsz float %t4, %x6
+ %t6 = fadd reassoc nsz float %t5, %x7
+ ret float %t6
+}
+
; Verify that we only need two associative operations to reassociate the operands.
; Also, we should reassociate such that the result of the high latency division
; is used by the final 'add' rather than reassociating the %x3 operand with the
; division. The latter reassociation would not improve anything.
define float @reassociate_adds6(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-STD-LABEL: reassociate_adds6:
-; CHECK-STD: // %bb.0:
-; CHECK-STD-NEXT: fdiv s0, s0, s1
-; CHECK-STD-NEXT: fadd s0, s2, s0
-; CHECK-STD-NEXT: fadd s0, s3, s0
-; CHECK-STD-NEXT: ret
-;
-; CHECK-UNSAFE-LABEL: reassociate_adds6:
-; CHECK-UNSAFE: // %bb.0:
-; CHECK-UNSAFE-NEXT: fdiv s0, s0, s1
-; CHECK-UNSAFE-NEXT: fadd s1, s3, s2
-; CHECK-UNSAFE-NEXT: fadd s0, s1, s0
-; CHECK-UNSAFE-NEXT: ret
+; CHECK-LABEL: reassociate_adds6:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fdiv s0, s0, s1
+; CHECK-NEXT: fadd s0, s2, s0
+; CHECK-NEXT: fadd s0, s3, s0
+; CHECK-NEXT: ret
%t0 = fdiv float %x0, %x1
%t1 = fadd float %x2, %t0
%t2 = fadd float %x3, %t1
ret float %t2
}
+define float @reassociate_adds6_reassoc(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds6_reassoc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fdiv s0, s0, s1
+; CHECK-NEXT: fadd s1, s3, s2
+; CHECK-NEXT: fadd s0, s1, s0
+; CHECK-NEXT: ret
+ %t0 = fdiv reassoc nsz float %x0, %x1
+ %t1 = fadd reassoc nsz float %x2, %t0
+ %t2 = fadd reassoc nsz float %x3, %t1
+ ret float %t2
+}
+
; Verify that scalar single-precision multiplies are reassociated.
define float @reassociate_muls1(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-STD-LABEL: reassociate_muls1:
-; CHECK-STD: // %bb.0:
-; CHECK-STD-NEXT: fdiv s0, s0, s1
-; CHECK-STD-NEXT: fmul s0, s2, s0
-; CHECK-STD-NEXT: fmul s0, s3, s0
-; CHECK-STD-NEXT: ret
-;
-; CHECK-UNSAFE-LABEL: reassociate_muls1:
-; CHECK-UNSAFE: // %bb.0:
-; CHECK-UNSAFE-NEXT: fdiv s0, s0, s1
-; CHECK-UNSAFE-NEXT: fmul s1, s3, s2
-; CHECK-UNSAFE-NEXT: fmul s0, s1, s0
-; CHECK-UNSAFE-NEXT: ret
+; CHECK-LABEL: reassociate_muls1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fdiv s0, s0, s1
+; CHECK-NEXT: fmul s0, s2, s0
+; CHECK-NEXT: fmul s0, s3, s0
+; CHECK-NEXT: ret
%t0 = fdiv float %x0, %x1
%t1 = fmul float %x2, %t0
%t2 = fmul float %x3, %t1
ret float %t2
}
+define float @reassociate_muls1_reassoc(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_muls1_reassoc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fdiv s0, s0, s1
+; CHECK-NEXT: fmul s1, s3, s2
+; CHECK-NEXT: fmul s0, s1, s0
+; CHECK-NEXT: ret
+ %t0 = fdiv reassoc nsz float %x0, %x1
+ %t1 = fmul reassoc nsz float %x2, %t0
+ %t2 = fmul reassoc nsz float %x3, %t1
+ ret float %t2
+}
+
; Verify that scalar double-precision adds are reassociated.
define double @reassociate_adds_double(double %x0, double %x1, double %x2, double %x3) {
-; CHECK-STD-LABEL: reassociate_adds_double:
-; CHECK-STD: // %bb.0:
-; CHECK-STD-NEXT: fdiv d0, d0, d1
-; CHECK-STD-NEXT: fadd d0, d2, d0
-; CHECK-STD-NEXT: fadd d0, d3, d0
-; CHECK-STD-NEXT: ret
-;
-; CHECK-UNSAFE-LABEL: reassociate_adds_double:
-; CHECK-UNSAFE: // %bb.0:
-; CHECK-UNSAFE-NEXT: fdiv d0, d0, d1
-; CHECK-UNSAFE-NEXT: fadd d1, d3, d2
-; CHECK-UNSAFE-NEXT: fadd d0, d1, d0
-; CHECK-UNSAFE-NEXT: ret
+; CHECK-LABEL: reassociate_adds_double:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fdiv d0, d0, d1
+; CHECK-NEXT: fadd d0, d2, d0
+; CHECK-NEXT: fadd d0, d3, d0
+; CHECK-NEXT: ret
%t0 = fdiv double %x0, %x1
%t1 = fadd double %x2, %t0
%t2 = fadd double %x3, %t1
ret double %t2
}
+define double @reassociate_adds_double_reassoc(double %x0, double %x1, double %x2, double %x3) {
+; CHECK-LABEL: reassociate_adds_double_reassoc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fdiv d0, d0, d1
+; CHECK-NEXT: fadd d1, d3, d2
+; CHECK-NEXT: fadd d0, d1, d0
+; CHECK-NEXT: ret
+ %t0 = fdiv reassoc nsz double %x0, %x1
+ %t1 = fadd reassoc nsz double %x2, %t0
+ %t2 = fadd reassoc nsz double %x3, %t1
+ ret double %t2
+}
+
; Verify that scalar double-precision multiplies are reassociated.
define double @reassociate_muls_double(double %x0, double %x1, double %x2, double %x3) {
-; CHECK-STD-LABEL: reassociate_muls_double:
-; CHECK-STD: // %bb.0:
-; CHECK-STD-NEXT: fdiv d0, d0, d1
-; CHECK-STD-NEXT: fmul d0, d2, d0
-; CHECK-STD-NEXT: fmul d0, d3, d0
-; CHECK-STD-NEXT: ret
-;
-; CHECK-UNSAFE-LABEL: reassociate_muls_double:
-; CHECK-UNSAFE: // %bb.0:
-; CHECK-UNSAFE-NEXT: fdiv d0, d0, d1
-; CHECK-UNSAFE-NEXT: fmul d1, d3, d2
-; CHECK-UNSAFE-NEXT: fmul d0, d1, d0
-; CHECK-UNSAFE-NEXT: ret
+; CHECK-LABEL: reassociate_muls_double:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fdiv d0, d0, d1
+; CHECK-NEXT: fmul d0, d2, d0
+; CHECK-NEXT: fmul d0, d3, d0
+; CHECK-NEXT: ret
%t0 = fdiv double %x0, %x1
%t1 = fmul double %x2, %t0
%t2 = fmul double %x3, %t1
ret double %t2
}
+define double @reassociate_muls_double_reassoc(double %x0, double %x1, double %x2, double %x3) {
+; CHECK-LABEL: reassociate_muls_double_reassoc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fdiv d0, d0, d1
+; CHECK-NEXT: fmul d1, d3, d2
+; CHECK-NEXT: fmul d0, d1, d0
+; CHECK-NEXT: ret
+ %t0 = fdiv reassoc nsz double %x0, %x1
+ %t1 = fmul reassoc nsz double %x2, %t0
+ %t2 = fmul reassoc nsz double %x3, %t1
+ ret double %t2
+}
+
; Verify that scalar half-precision adds are reassociated.
define half @reassociate_adds_half(half %x0, half %x1, half %x2, half %x3) {
-; CHECK-STD-LABEL: reassociate_adds_half:
-; CHECK-STD: // %bb.0:
-; CHECK-STD-NEXT: fdiv h0, h0, h1
-; CHECK-STD-NEXT: fadd h0, h2, h0
-; CHECK-STD-NEXT: fadd h0, h3, h0
-; CHECK-STD-NEXT: ret
-;
-; CHECK-UNSAFE-LABEL: reassociate_adds_half:
-; CHECK-UNSAFE: // %bb.0:
-; CHECK-UNSAFE-NEXT: fdiv h0, h0, h1
-; CHECK-UNSAFE-NEXT: fadd h1, h3, h2
-; CHECK-UNSAFE-NEXT: fadd h0, h1, h0
-; CHECK-UNSAFE-NEXT: ret
+; CHECK-LABEL: reassociate_adds_half:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fdiv h0, h0, h1
+; CHECK-NEXT: fadd h0, h2, h0
+; CHECK-NEXT: fadd h0, h3, h0
+; CHECK-NEXT: ret
%t0 = fdiv half %x0, %x1
%t1 = fadd half %x2, %t0
%t2 = fadd half %x3, %t1
ret half %t2
}
+define half @reassociate_adds_half_reassoc(half %x0, half %x1, half %x2, half %x3) {
+; CHECK-LABEL: reassociate_adds_half_reassoc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fdiv h0, h0, h1
+; CHECK-NEXT: fadd h1, h3, h2
+; CHECK-NEXT: fadd h0, h1, h0
+; CHECK-NEXT: ret
+ %t0 = fdiv reassoc nsz half %x0, %x1
+ %t1 = fadd reassoc nsz half %x2, %t0
+ %t2 = fadd reassoc nsz half %x3, %t1
+ ret half %t2
+}
+
; Verify that scalar half-precision multiplies are reassociated.
define half @reassociate_muls_half(half %x0, half %x1, half %x2, half %x3) {
-; CHECK-STD-LABEL: reassociate_muls_half:
-; CHECK-STD: // %bb.0:
-; CHECK-STD-NEXT: fdiv h0, h0, h1
-; CHECK-STD-NEXT: fmul h0, h2, h0
-; CHECK-STD-NEXT: fmul h0, h3, h0
-; CHECK-STD-NEXT: ret
-;
-; CHECK-UNSAFE-LABEL: reassociate_muls_half:
-; CHECK-UNSAFE: // %bb.0:
-; CHECK-UNSAFE-NEXT: fdiv h0, h0, h1
-; CHECK-UNSAFE-NEXT: fmul h1, h3, h2
-; CHECK-UNSAFE-NEXT: fmul h0, h1, h0
-; CHECK-UNSAFE-NEXT: ret
+; CHECK-LABEL: reassociate_muls_half:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fdiv h0, h0, h1
+; CHECK-NEXT: fmul h0, h2, h0
+; CHECK-NEXT: fmul h0, h3, h0
+; CHECK-NEXT: ret
%t0 = fdiv half %x0, %x1
%t1 = fmul half %x2, %t0
%t2 = fmul half %x3, %t1
ret half %t2
}
+define half @reassociate_muls_half_reassoc(half %x0, half %x1, half %x2, half %x3) {
+; CHECK-LABEL: reassociate_muls_half_reassoc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fdiv h0, h0, h1
+; CHECK-NEXT: fmul h1, h3, h2
+; CHECK-NEXT: fmul h0, h1, h0
+; CHECK-NEXT: ret
+ %t0 = fdiv reassoc nsz half %x0, %x1
+ %t1 = fmul reassoc nsz half %x2, %t0
+ %t2 = fmul reassoc nsz half %x3, %t1
+ ret half %t2
+}
+
; Verify that scalar integer adds are reassociated.
define i32 @reassociate_adds_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
@@ -365,173 +414,222 @@ define i32 @reassociate_xors_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
; Verify that we reassociate vector instructions too.
define <4 x float> @vector_reassociate_adds1(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) {
-; CHECK-STD-LABEL: vector_reassociate_adds1:
-; CHECK-STD: // %bb.0:
-; CHECK-STD-NEXT: fadd v0.4s, v0.4s, v1.4s
-; CHECK-STD-NEXT: fadd v0.4s, v0.4s, v2.4s
-; CHECK-STD-NEXT: fadd v0.4s, v0.4s, v3.4s
-; CHECK-STD-NEXT: ret
-;
-; CHECK-UNSAFE-LABEL: vector_reassociate_adds1:
-; CHECK-UNSAFE: // %bb.0:
-; CHECK-UNSAFE-NEXT: fadd v0.4s, v0.4s, v1.4s
-; CHECK-UNSAFE-NEXT: fadd v1.4s, v2.4s, v3.4s
-; CHECK-UNSAFE-NEXT: fadd v0.4s, v0.4s, v1.4s
-; CHECK-UNSAFE-NEXT: ret
+; CHECK-LABEL: vector_reassociate_adds1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: ret
%t0 = fadd <4 x float> %x0, %x1
%t1 = fadd <4 x float> %t0, %x2
%t2 = fadd <4 x float> %t1, %x3
ret <4 x float> %t2
}
+define <4 x float> @vector_reassociate_adds1_reassoc(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) {
+; CHECK-LABEL: vector_reassociate_adds1_reassoc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fadd v1.4s, v2.4s, v3.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
+ %t0 = fadd reassoc nsz <4 x float> %x0, %x1
+ %t1 = fadd reassoc nsz <4 x float> %t0, %x2
+ %t2 = fadd reassoc nsz <4 x float> %t1, %x3
+ ret <4 x float> %t2
+}
+
define <4 x float> @vector_reassociate_adds2(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) {
-; CHECK-STD-LABEL: vector_reassociate_adds2:
-; CHECK-STD: // %bb.0:
-; CHECK-STD-NEXT: fadd v0.4s, v0.4s, v1.4s
-; CHECK-STD-NEXT: fadd v0.4s, v2.4s, v0.4s
-; CHECK-STD-NEXT: fadd v0.4s, v0.4s, v3.4s
-; CHECK-STD-NEXT: ret
-;
-; CHECK-UNSAFE-LABEL: vector_reassociate_adds2:
-; CHECK-UNSAFE: // %bb.0:
-; CHECK-UNSAFE-NEXT: fadd v0.4s, v0.4s, v1.4s
-; CHECK-UNSAFE-NEXT: fadd v1.4s, v2.4s, v3.4s
-; CHECK-UNSAFE-NEXT: fadd v0.4s, v1.4s, v0.4s
-; CHECK-UNSAFE-NEXT: ret
+; CHECK-LABEL: vector_reassociate_adds2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fadd v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: ret
%t0 = fadd <4 x float> %x0, %x1
%t1 = fadd <4 x float> %x2, %t0
%t2 = fadd <4 x float> %t1, %x3
ret <4 x float> %t2
}
+define <4 x float> @vector_reassociate_adds2_reassoc(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) {
+; CHECK-LABEL: vector_reassociate_adds2_reassoc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fadd v1.4s, v2.4s, v3.4s
+; CHECK-NEXT: fadd v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: ret
+ %t0 = fadd reassoc nsz <4 x float> %x0, %x1
+ %t1 = fadd reassoc nsz <4 x float> %x2, %t0
+ %t2 = fadd reassoc nsz <4 x float> %t1, %x3
+ ret <4 x float> %t2
+}
+
define <4 x float> @vector_reassociate_adds3(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) {
-; CHECK-STD-LABEL: vector_reassociate_adds3:
-; CHECK-STD: // %bb.0:
-; CHECK-STD-NEXT: fadd v0.4s, v0.4s, v1.4s
-; CHECK-STD-NEXT: fadd v0.4s, v0.4s, v2.4s
-; CHECK-STD-NEXT: fadd v0.4s, v3.4s, v0.4s
-; CHECK-STD-NEXT: ret
-;
-; CHECK-UNSAFE-LABEL: vector_reassociate_adds3:
-; CHECK-UNSAFE: // %bb.0:
-; CHECK-UNSAFE-NEXT: fadd v0.4s, v0.4s, v1.4s
-; CHECK-UNSAFE-NEXT: fadd v1.4s, v3.4s, v2.4s
-; CHECK-UNSAFE-NEXT: fadd v0.4s, v1.4s, v0.4s
-; CHECK-UNSAFE-NEXT: ret
+; CHECK-LABEL: vector_reassociate_adds3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: fadd v0.4s, v3.4s, v0.4s
+; CHECK-NEXT: ret
%t0 = fadd <4 x float> %x0, %x1
%t1 = fadd <4 x float> %t0, %x2
%t2 = fadd <4 x float> %x3, %t1
ret <4 x float> %t2
}
+define <4 x float> @vector_reassociate_adds3_reassoc(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) {
+; CHECK-LABEL: vector_reassociate_adds3_reassoc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fadd v1.4s, v3.4s, v2.4s
+; CHECK-NEXT: fadd v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: ret
+ %t0 = fadd reassoc nsz <4 x float> %x0, %x1
+ %t1 = fadd reassoc nsz <4 x float> %t0, %x2
+ %t2 = fadd reassoc nsz <4 x float> %x3, %t1
+ ret <4 x float> %t2
+}
+
define <4 x float> @vector_reassociate_adds4(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) {
-; CHECK-STD-LABEL: vector_reassociate_adds4:
-; CHECK-STD: // %bb.0:
-; CHECK-STD-NEXT: fadd v0.4s, v0.4s, v1.4s
-; CHECK-STD-NEXT: fadd v0.4s, v2.4s, v0.4s
-; CHECK-STD-NEXT: fadd v0.4s, v3.4s, v0.4s
-; CHECK-STD-NEXT: ret
-;
-; CHECK-UNSAFE-LABEL: vector_reassociate_adds4:
-; CHECK-UNSAFE: // %bb.0:
-; CHECK-UNSAFE-NEXT: fadd v0.4s, v0.4s, v1.4s
-; CHECK-UNSAFE-NEXT: fadd v1.4s, v3.4s, v2.4s
-; CHECK-UNSAFE-NEXT: fadd v0.4s, v1.4s, v0.4s
-; CHECK-UNSAFE-NEXT: ret
+; CHECK-LABEL: vector_reassociate_adds4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fadd v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: fadd v0.4s, v3.4s, v0.4s
+; CHECK-NEXT: ret
%t0 = fadd <4 x float> %x0, %x1
%t1 = fadd <4 x float> %x2, %t0
%t2 = fadd <4 x float> %x3, %t1
ret <4 x float> %t2
}
+define <4 x float> @vector_reassociate_adds4_reassoc(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) {
+; CHECK-LABEL: vector_reassociate_adds4_reassoc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fadd v1.4s, v3.4s, v2.4s
+; CHECK-NEXT: fadd v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: ret
+ %t0 = fadd reassoc nsz <4 x float> %x0, %x1
+ %t1 = fadd reassoc nsz <4 x float> %x2, %t0
+ %t2 = fadd reassoc nsz <4 x float> %x3, %t1
+ ret <4 x float> %t2
+}
+
; Verify that 64-bit vector half-precision adds are reassociated.
define <4 x half> @reassociate_adds_v4f16(<4 x half> %x0, <4 x half> %x1, <4 x half> %x2, <4 x half> %x3) {
-; CHECK-STD-LABEL: reassociate_adds_v4f16:
-; CHECK-STD: // %bb.0:
-; CHECK-STD-NEXT: fadd v0.4h, v0.4h, v1.4h
-; CHECK-STD-NEXT: fadd v0.4h, v2.4h, v0.4h
-; CHECK-STD-NEXT: fadd v0.4h, v3.4h, v0.4h
-; CHECK-STD-NEXT: ret
-;
-; CHECK-UNSAFE-LABEL: reassociate_adds_v4f16:
-; CHECK-UNSAFE: // %bb.0:
-; CHECK-UNSAFE-NEXT: fadd v0.4h, v0.4h, v1.4h
-; CHECK-UNSAFE-NEXT: fadd v1.4h, v3.4h, v2.4h
-; CHECK-UNSAFE-NEXT: fadd v0.4h, v1.4h, v0.4h
-; CHECK-UNSAFE-NEXT: ret
+; CHECK-LABEL: reassociate_adds_v4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: fadd v0.4h, v2.4h, v0.4h
+; CHECK-NEXT: fadd v0.4h, v3.4h, v0.4h
+; CHECK-NEXT: ret
%t0 = fadd <4 x half> %x0, %x1
%t1 = fadd <4 x half> %x2, %t0
%t2 = fadd <4 x half> %x3, %t1
ret <4 x half> %t2
}
+define <4 x half> @reassociate_adds_v4f16_reassoc(<4 x half> %x0, <4 x half> %x1, <4 x half> %x2, <4 x half> %x3) {
+; CHECK-LABEL: reassociate_adds_v4f16_reassoc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: fadd v1.4h, v3.4h, v2.4h
+; CHECK-NEXT: fadd v0.4h, v1.4h, v0.4h
+; CHECK-NEXT: ret
+ %t0 = fadd reassoc nsz <4 x half> %x0, %x1
+ %t1 = fadd reassoc nsz <4 x half> %x2, %t0
+ %t2 = fadd reassoc nsz <4 x half> %x3, %t1
+ ret <4 x half> %t2
+}
+
; Verify that 128-bit vector half-precision multiplies are reassociated.
define <8 x half> @reassociate_muls_v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, <8 x half> %x3) {
-; CHECK-STD-LABEL: reassociate_muls_v8f16:
-; CHECK-STD: // %bb.0:
-; CHECK-STD-NEXT: fadd v0.8h, v0.8h, v1.8h
-; CHECK-STD-NEXT: fmul v0.8h, v2.8h, v0.8h
-; CHECK-STD-NEXT: fmul v0.8h, v3.8h, v0.8h
-; CHECK-STD-NEXT: ret
-;
-; CHECK-UNSAFE-LABEL: reassociate_muls_v8f16:
-; CHECK-UNSAFE: // %bb.0:
-; CHECK-UNSAFE-NEXT: fadd v0.8h, v0.8h, v1.8h
-; CHECK-UNSAFE-NEXT: fmul v1.8h, v3.8h, v2.8h
-; CHECK-UNSAFE-NEXT: fmul v0.8h, v1.8h, v0.8h
-; CHECK-UNSAFE-NEXT: ret
+; CHECK-LABEL: reassociate_muls_v8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: fmul v0.8h, v2.8h, v0.8h
+; CHECK-NEXT: fmul v0.8h, v3.8h, v0.8h
+; CHECK-NEXT: ret
%t0 = fadd <8 x half> %x0, %x1
%t1 = fmul <8 x half> %x2, %t0
%t2 = fmul <8 x half> %x3, %t1
ret <8 x half> %t2
}
+define <8 x half> @reassociate_muls_v8f16_reassoc(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, <8 x half> %x3) {
+; CHECK-LABEL: reassociate_muls_v8f16_reassoc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: fmul v1.8h, v3.8h, v2.8h
+; CHECK-NEXT: fmul v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: ret
+ %t0 = fadd reassoc nsz <8 x half> %x0, %x1
+ %t1 = fmul reassoc nsz <8 x half> %x2, %t0
+ %t2 = fmul reassoc nsz <8 x half> %x3, %t1
+ ret <8 x half> %t2
+}
+
; Verify that 128-bit vector single-precision multiplies are reassociated.
define <4 x float> @reassociate_muls_v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) {
-; CHECK-STD-LABEL: reassociate_muls_v4f32:
-; CHECK-STD: // %bb.0:
-; CHECK-STD-NEXT: fadd v0.4s, v0.4s, v1.4s
-; CHECK-STD-NEXT: fmul v0.4s, v2.4s, v0.4s
-; CHECK-STD-NEXT: fmul v0.4s, v3.4s, v0.4s
-; CHECK-STD-NEXT: ret
-;
-; CHECK-UNSAFE-LABEL: reassociate_muls_v4f32:
-; CHECK-UNSAFE: // %bb.0:
-; CHECK-UNSAFE-NEXT: fadd v0.4s, v0.4s, v1.4s
-; CHECK-UNSAFE-NEXT: fmul v1.4s, v3.4s, v2.4s
-; CHECK-UNSAFE-NEXT: fmul v0.4s, v1.4s, v0.4s
-; CHECK-UNSAFE-NEXT: ret
+; CHECK-LABEL: reassociate_muls_v4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fmul v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: fmul v0.4s, v3.4s, v0.4s
+; CHECK-NEXT: ret
%t0 = fadd <4 x float> %x0, %x1
%t1 = fmul <4 x float> %x2, %t0
%t2 = fmul <4 x float> %x3, %t1
ret <4 x float> %t2
}
+define <4 x float> @reassociate_muls_v4f32_reassoc(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) {
+; CHECK-LABEL: reassociate_muls_v4f32_reassoc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fmul v1.4s, v3.4s, v2.4s
+; CHECK-NEXT: fmul v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: ret
+ %t0 = fadd reassoc nsz <4 x float> %x0, %x1
+ %t1 = fmul reassoc nsz <4 x float> %x2, %t0
+ %t2 = fmul reassoc nsz <4 x float> %x3, %t1
+ ret <4 x float> %t2
+}
+
; Verify that 128-bit vector double-precision multiplies are reassociated.
define <2 x double> @reassociate_muls_v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, <2 x double> %x3) {
-; CHECK-STD-LABEL: reassociate_muls_v2f64:
-; CHECK-STD: // %bb.0:
-; CHECK-STD-NEXT: fadd v0.2d, v0.2d, v1.2d
-; CHECK-STD-NEXT: fmul v0.2d, v2.2d, v0.2d
-; CHECK-STD-NEXT: fmul v0.2d, v3.2d, v0.2d
-; CHECK-STD-NEXT: ret
-;
-; CHECK-UNSAFE-LABEL: reassociate_muls_v2f64:
-; CHECK-UNSAFE: // %bb.0:
-; CHECK-UNSAFE-NEXT: fadd v0.2d, v0.2d, v1.2d
-; CHECK-UNSAFE-NEXT: fmul v1.2d, v3.2d, v2.2d
-; CHECK-UNSAFE-NEXT: fmul v0.2d, v1.2d, v0.2d
-; CHECK-UNSAFE-NEXT: ret
+; CHECK-LABEL: reassociate_muls_v2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: fmul v0.2d, v2.2d, v0.2d
+; CHECK-NEXT: fmul v0.2d, v3.2d, v0.2d
+; CHECK-NEXT: ret
%t0 = fadd <2 x double> %x0, %x1
%t1 = fmul <2 x double> %x2, %t0
%t2 = fmul <2 x double> %x3, %t1
ret <2 x double> %t2
}
+define <2 x double> @reassociate_muls_v2f64_reassoc(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, <2 x double> %x3) {
+; CHECK-LABEL: reassociate_muls_v2f64_reassoc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: fmul v1.2d, v3.2d, v2.2d
+; CHECK-NEXT: fmul v0.2d, v1.2d, v0.2d
+; CHECK-NEXT: ret
+ %t0 = fadd reassoc nsz <2 x double> %x0, %x1
+ %t1 = fmul reassoc nsz <2 x double> %x2, %t0
+ %t2 = fmul reassoc nsz <2 x double> %x3, %t1
+ ret <2 x double> %t2
+}
+
; Verify that vector integer arithmetic operations are reassociated.
define <2 x i32> @reassociate_muls_v2i32(<2 x i32> %x0, <2 x i32> %x1, <2 x i32> %x2, <2 x i32> %x3) {
@@ -606,65 +704,83 @@ define <4 x i32> @reassociate_xors_v4i32(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>
; Verify that scalable vector FP arithmetic operations are reassociated.
define <vscale x 8 x half> @reassociate_adds_nxv4f16(<vscale x 8 x half> %x0, <vscale x 8 x half> %x1, <vscale x 8 x half> %x2, <vscale x 8 x half> %x3) {
-; CHECK-STD-LABEL: reassociate_adds_nxv4f16:
-; CHECK-STD: // %bb.0:
-; CHECK-STD-NEXT: fadd z0.h, z0.h, z1.h
-; CHECK-STD-NEXT: fadd z0.h, z2.h, z0.h
-; CHECK-STD-NEXT: fadd z0.h, z3.h, z0.h
-; CHECK-STD-NEXT: ret
-;
-; CHECK-UNSAFE-LABEL: reassociate_adds_nxv4f16:
-; CHECK-UNSAFE: // %bb.0:
-; CHECK-UNSAFE-NEXT: fadd z0.h, z0.h, z1.h
-; CHECK-UNSAFE-NEXT: fadd z1.h, z3.h, z2.h
-; CHECK-UNSAFE-NEXT: fadd z0.h, z1.h, z0.h
-; CHECK-UNSAFE-NEXT: ret
+; CHECK-LABEL: reassociate_adds_nxv4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd z0.h, z0.h, z1.h
+; CHECK-NEXT: fadd z0.h, z2.h, z0.h
+; CHECK-NEXT: fadd z0.h, z3.h, z0.h
+; CHECK-NEXT: ret
%t0 = fadd reassoc <vscale x 8 x half> %x0, %x1
%t1 = fadd reassoc <vscale x 8 x half> %x2, %t0
%t2 = fadd reassoc <vscale x 8 x half> %x3, %t1
ret <vscale x 8 x half> %t2
}
+define <vscale x 8 x half> @reassociate_adds_nxv4f16_nsz(<vscale x 8 x half> %x0, <vscale x 8 x half> %x1, <vscale x 8 x half> %x2, <vscale x 8 x half> %x3) {
+; CHECK-LABEL: reassociate_adds_nxv4f16_nsz:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd z0.h, z0.h, z1.h
+; CHECK-NEXT: fadd z1.h, z3.h, z2.h
+; CHECK-NEXT: fadd z0.h, z1.h, z0.h
+; CHECK-NEXT: ret
+ %t0 = fadd reassoc nsz <vscale x 8 x half> %x0, %x1
+ %t1 = fadd reassoc nsz <vscale x 8 x half> %x2, %t0
+ %t2 = fadd reassoc nsz <vscale x 8 x half> %x3, %t1
+ ret <vscale x 8 x half> %t2
+}
+
define <vscale x 4 x float> @reassociate_adds_nxv4f32(<vscale x 4 x float> %x0, <vscale x 4 x float> %x1, <vscale x 4 x float> %x2, <vscale x 4 x float> %x3) {
-; CHECK-STD-LABEL: reassociate_adds_nxv4f32:
-; CHECK-STD: // %bb.0:
-; CHECK-STD-NEXT: fadd z0.s, z0.s, z1.s
-; CHECK-STD-NEXT: fadd z0.s, z2.s, z0.s
-; CHECK-STD-NEXT: fadd z0.s, z3.s, z0.s
-; CHECK-STD-NEXT: ret
-;
-; CHECK-UNSAFE-LABEL: reassociate_adds_nxv4f32:
-; CHECK-UNSAFE: // %bb.0:
-; CHECK-UNSAFE-NEXT: fadd z0.s, z0.s, z1.s
-; CHECK-UNSAFE-NEXT: fadd z1.s, z3.s, z2.s
-; CHECK-UNSAFE-NEXT: fadd z0.s, z1.s, z0.s
-; CHECK-UNSAFE-NEXT: ret
+; CHECK-LABEL: reassociate_adds_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd z0.s, z0.s, z1.s
+; CHECK-NEXT: fadd z0.s, z2.s, z0.s
+; CHECK-NEXT: fadd z0.s, z3.s, z0.s
+; CHECK-NEXT: ret
%t0 = fadd reassoc <vscale x 4 x float> %x0, %x1
%t1 = fadd reassoc <vscale x 4 x float> %x2, %t0
%t2 = fadd reassoc <vscale x 4 x float> %x3, %t1
ret <vscale x 4 x float> %t2
}
+define <vscale x 4 x float> @reassociate_adds_nxv4f32_nsz(<vscale x 4 x float> %x0, <vscale x 4 x float> %x1, <vscale x 4 x float> %x2, <vscale x 4 x float> %x3) {
+; CHECK-LABEL: reassociate_adds_nxv4f32_nsz:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fadd z0.s, z0.s, z1.s
+; CHECK-NEXT: fadd z1.s, z3.s, z2.s
+; CHECK-NEXT: fadd z0.s, z1.s, z0.s
+; CHECK-NEXT: ret
+ %t0 = fadd reassoc nsz <vscale x 4 x float> %x0, %x1
+ %t1 = fadd reassoc nsz <vscale x 4 x float> %x2, %t0
+ %t2 = fadd reassoc nsz <vscale x 4 x float> %x3, %t1
+ ret <vscale x 4 x float> %t2
+}
+
define <vscale x 2 x double> @reassociate_muls_nxv2f64(<vscale x 2 x double> %x0, <vscale x 2 x double> %x1, <vscale x 2 x double> %x2, <vscale x 2 x double> %x3) {
-; CHECK-STD-LABEL: reassociate_muls_nxv2f64:
-; CHECK-STD: // %bb.0:
-; CHECK-STD-NEXT: fmul z0.d, z0.d, z1.d
-; CHECK-STD-NEXT: fmul z0.d, z2.d, z0.d
-; CHECK-STD-NEXT: fmul z0.d, z3.d, z0.d
-; CHECK-STD-NEXT: ret
-;
-; CHECK-UNSAFE-LABEL: reassociate_muls_nxv2f64:
-; CHECK-UNSAFE: // %bb.0:
-; CHECK-UNSAFE-NEXT: fmul z0.d, z0.d, z1.d
-; CHECK-UNSAFE-NEXT: fmul z1.d, z3.d, z2.d
-; CHECK-UNSAFE-NEXT: fmul z0.d, z1.d, z0.d
-; CHECK-UNSAFE-NEXT: ret
+; CHECK-LABEL: reassociate_muls_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmul z0.d, z0.d, z1.d
+; CHECK-NEXT: fmul z0.d, z2.d, z0.d
+; CHECK-NEXT: fmul z0.d, z3.d, z0.d
+; CHECK-NEXT: ret
%t0 = fmul reassoc <vscale x 2 x double> %x0, %x1
%t1 = fmul reassoc <vscale x 2 x double> %x2, %t0
%t2 = fmul reassoc <vscale x 2 x double> %x3, %t1
ret <vscale x 2 x double> %t2
}
+define <vscale x 2 x double> @reassociate_muls_nxv2f64_nsz(<vscale x 2 x double> %x0, <vscale x 2 x double> %x1, <vscale x 2 x double> %x2, <vscale x 2 x double> %x3) {
+; CHECK-LABEL: reassociate_muls_nxv2f64_nsz:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmul z0.d, z0.d, z1.d
+; CHECK-NEXT: fmul z1.d, z3.d, z2.d
+; CHECK-NEXT: fmul z0.d, z1.d, z0.d
+; CHECK-NEXT: ret
+ %t0 = fmul reassoc nsz <vscale x 2 x double> %x0, %x1
+ %t1 = fmul reassoc nsz <vscale x 2 x double> %x2, %t0
+ %t2 = fmul reassoc nsz <vscale x 2 x double> %x3, %t1
+ ret <vscale x 2 x double> %t2
+}
+
; Verify that scalable vector integer arithmetic operations are reassociated.
define <vscale x 16 x i8> @reassociate_muls_nxv16i8(<vscale x 16 x i8> %x0, <vscale x 16 x i8> %x1, <vscale x 16 x i8> %x2, <vscale x 16 x i8> %x3) {
@@ -753,55 +869,30 @@ define <vscale x 8 x i16> @reassociate_ors_nxv8i16(<vscale x 8 x i16> %x0, <vsca
declare double @bar()
define double @reassociate_adds_from_calls() {
-; CHECK-STD-LABEL: reassociate_adds_from_calls:
-; CHECK-STD: // %bb.0:
-; CHECK-STD-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-STD-NEXT: stp d9, d8, [sp, #8] // 16-byte Folded Spill
-; CHECK-STD-NEXT: str x30, [sp, #24] // 8-byte Folded Spill
-; CHECK-STD-NEXT: .cfi_def_cfa_offset 32
-; CHECK-STD-NEXT: .cfi_offset w30, -8
-; CHECK-STD-NEXT: .cfi_offset b8, -16
-; CHECK-STD-NEXT: .cfi_offset b9, -24
-; CHECK-STD-NEXT: .cfi_offset b10, -32
-; CHECK-STD-NEXT: bl bar
-; CHECK-STD-NEXT: fmov d8, d0
-; CHECK-STD-NEXT: bl bar
-; CHECK-STD-NEXT: fmov d9, d0
-; CHECK-STD-NEXT: bl bar
-; CHECK-STD-NEXT: fmov d10, d0
-; CHECK-STD-NEXT: bl bar
-; CHECK-STD-NEXT: fadd d1, d8, d9
-; CHECK-STD-NEXT: ldp d9, d8, [sp, #8] // 16-byte Folded Reload
-; CHECK-STD-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload
-; CHECK-STD-NEXT: fadd d1, d1, d10
-; CHECK-STD-NEXT: fadd d0, d1, d0
-; CHECK-STD-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload
-; CHECK-STD-NEXT: ret
-;
-; CHECK-UNSAFE-LABEL: reassociate_adds_from_calls:
-; CHECK-UNSAFE: // %bb.0:
-; CHECK-UNSAFE-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-UNSAFE-NEXT: stp d9, d8, [sp, #8] // 16-byte Folded Spill
-; CHECK-UNSAFE-NEXT: str x30, [sp, #24] // 8-byte Folded Spill
-; CHECK-UNSAFE-NEXT: .cfi_def_cfa_offset 32
-; CHECK-UNSAFE-NEXT: .cfi_offset w30, -8
-; CHECK-UNSAFE-NEXT: .cfi_offset b8, -16
-; CHECK-UNSAFE-NEXT: .cfi_offset b9, -24
-; CHECK-UNSAFE-NEXT: .cfi_offset b10, -32
-; CHECK-UNSAFE-NEXT: bl bar
-; CHECK-UNSAFE-NEXT: fmov d8, d0
-; CHECK-UNSAFE-NEXT: bl bar
-; CHECK-UNSAFE-NEXT: fmov d9, d0
-; CHECK-UNSAFE-NEXT: bl bar
-; CHECK-UNSAFE-NEXT: fmov d10, d0
-; CHECK-UNSAFE-NEXT: bl bar
-; CHECK-UNSAFE-NEXT: fadd d1, d8, d9
-; CHECK-UNSAFE-NEXT: ldp d9, d8, [sp, #8] // 16-byte Folded Reload
-; CHECK-UNSAFE-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload
-; CHECK-UNSAFE-NEXT: fadd d0, d10, d0
-; CHECK-UNSAFE-NEXT: fadd d0, d1, d0
-; CHECK-UNSAFE-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload
-; CHECK-UNSAFE-NEXT: ret
+; CHECK-LABEL: reassociate_adds_from_calls:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #8] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #24] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset b8, -16
+; CHECK-NEXT: .cfi_offset b9, -24
+; CHECK-NEXT: .cfi_offset b10, -32
+; CHECK-NEXT: bl bar
+; CHECK-NEXT: fmov d8, d0
+; CHECK-NEXT: bl bar
+; CHECK-NEXT: fmov d9, d0
+; CHECK-NEXT: bl bar
+; CHECK-NEXT: fmov d10, d0
+; CHECK-NEXT: bl bar
+; CHECK-NEXT: fadd d1, d8, d9
+; CHECK-NEXT: ldp d9, d8, [sp, #8] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT: fadd d1, d1, d10
+; CHECK-NEXT: fadd d0, d1, d0
+; CHECK-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ret
%x0 = call double @bar()
%x1 = call double @bar()
%x2 = call double @bar()
@@ -812,6 +903,41 @@ define double @reassociate_adds_from_calls() {
ret double %t2
}
+define double @reassociate_adds_from_calls_reassoc() {
+; CHECK-LABEL: reassociate_adds_from_calls_reassoc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #8] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #24] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset b8, -16
+; CHECK-NEXT: .cfi_offset b9, -24
+; CHECK-NEXT: .cfi_offset b10, -32
+; CHECK-NEXT: bl bar
+; CHECK-NEXT: fmov d8, d0
+; CHECK-NEXT: bl bar
+; CHECK-NEXT: fmov d9, d0
+; CHECK-NEXT: bl bar
+; CHECK-NEXT: fmov d10, d0
+; CHECK-NEXT: bl bar
+; CHECK-NEXT: fadd d1, d8, d9
+; CHECK-NEXT: ldp d9, d8, [sp, #8] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT: fadd d0, d10, d0
+; CHECK-NEXT: fadd d0, d1, d0
+; CHECK-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %x0 = call reassoc nsz double @bar()
+ %x1 = call reassoc nsz double @bar()
+ %x2 = call reassoc nsz double @bar()
+ %x3 = call reassoc nsz double @bar()
+ %t0 = fadd reassoc nsz double %x0, %x1
+ %t1 = fadd reassoc nsz double %t0, %x2
+ %t2 = fadd reassoc nsz double %t1, %x3
+ ret double %t2
+}
+
define double @already_reassociated() {
; CHECK-LABEL: already_reassociated:
; CHECK: // %bb.0:
@@ -846,3 +972,38 @@ define double @already_reassociated() {
%t2 = fadd double %t0, %t1
ret double %t2
}
+
+define double @already_reassociated_reassoc() {
+; CHECK-LABEL: already_reassociated_reassoc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #8] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #24] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset b8, -16
+; CHECK-NEXT: .cfi_offset b9, -24
+; CHECK-NEXT: .cfi_offset b10, -32
+; CHECK-NEXT: bl bar
+; CHECK-NEXT: fmov d8, d0
+; CHECK-NEXT: bl bar
+; CHECK-NEXT: fmov d9, d0
+; CHECK-NEXT: bl bar
+; CHECK-NEXT: fmov d10, d0
+; CHECK-NEXT: bl bar
+; CHECK-NEXT: fadd d1, d8, d9
+; CHECK-NEXT: ldp d9, d8, [sp, #8] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT: fadd d0, d10, d0
+; CHECK-NEXT: fadd d0, d1, d0
+; CHECK-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %x0 = call reassoc nsz double @bar()
+ %x1 = call reassoc nsz double @bar()
+ %x2 = call reassoc nsz double @bar()
+ %x3 = call reassoc nsz double @bar()
+ %t0 = fadd reassoc nsz double %x0, %x1
+ %t1 = fadd reassoc nsz double %x2, %x3
+ %t2 = fadd reassoc nsz double %t0, %t1
+ ret double %t2
+}
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner.mir b/llvm/test/CodeGen/AArch64/machine-combiner.mir
index b967aaa..a0e1280 100644
--- a/llvm/test/CodeGen/AArch64/machine-combiner.mir
+++ b/llvm/test/CodeGen/AArch64/machine-combiner.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a57 -enable-unsafe-fp-math \
+# RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a57 \
# RUN: -run-pass machine-combiner -machine-combiner-inc-threshold=0 \
# RUN: -machine-combiner-verify-pattern-order=true -verify-machineinstrs -o - %s | FileCheck %s
---
@@ -36,8 +36,8 @@ body: |
%6 = ADDWrr %3, killed %5
%7 = SCVTFUWDri killed %6, implicit $fpcr
; CHECK: FMADDDrrr %7, %7, %0, implicit $fpcr
- %8 = FMULDrr %7, %7, implicit $fpcr
- %9 = FADDDrr %0, killed %8, implicit $fpcr
+ %8 = contract FMULDrr %7, %7, implicit $fpcr
+ %9 = contract FADDDrr %0, killed %8, implicit $fpcr
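+    ; Note (descriptive, inferred from this patch's RUN-line change): the
+    ; per-instruction contract flags stand in for the removed
+    ; -enable-unsafe-fp-math option; they are what now permits the machine
+    ; combiner to fuse the mul/add pair into the FMADDDrrr checked above.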
$d0 = COPY %9
RET_ReallyLR implicit $d0
diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index 9443004..4f0c408 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -6810,200 +6810,195 @@ define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> %
; CHECK-SD-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
; CHECK-SD-NEXT: .cfi_offset w29, -16
-; CHECK-SD-NEXT: ldr b0, [sp, #208]
+; CHECK-SD-NEXT: ldr b5, [sp, #208]
; CHECK-SD-NEXT: add x8, sp, #216
-; CHECK-SD-NEXT: add x9, sp, #272
-; CHECK-SD-NEXT: ldr b2, [sp, #80]
+; CHECK-SD-NEXT: fmov s0, w0
; CHECK-SD-NEXT: ldr b4, [sp, #976]
-; CHECK-SD-NEXT: ldr b6, [sp, #720]
-; CHECK-SD-NEXT: ld1 { v0.b }[1], [x8]
+; CHECK-SD-NEXT: add x9, sp, #984
+; CHECK-SD-NEXT: add x12, sp, #328
+; CHECK-SD-NEXT: ld1 { v5.b }[1], [x8]
; CHECK-SD-NEXT: add x8, sp, #224
-; CHECK-SD-NEXT: fmov s16, w0
-; CHECK-SD-NEXT: ldr b17, [sp, #848]
-; CHECK-SD-NEXT: add x10, sp, #24
-; CHECK-SD-NEXT: movi v19.2d, #0000000000000000
-; CHECK-SD-NEXT: ld1 { v0.b }[2], [x8]
+; CHECK-SD-NEXT: movi v1.16b, #1
+; CHECK-SD-NEXT: mov v0.b[1], w1
+; CHECK-SD-NEXT: ld1 { v4.b }[1], [x9]
+; CHECK-SD-NEXT: movi v3.2d, #0000000000000000
+; CHECK-SD-NEXT: add x11, sp, #992
+; CHECK-SD-NEXT: ldr b6, [sp, #720]
+; CHECK-SD-NEXT: ldr b7, [sp, #80]
+; CHECK-SD-NEXT: ld1 { v5.b }[2], [x8]
; CHECK-SD-NEXT: add x8, sp, #232
-; CHECK-SD-NEXT: mov v16.b[1], w1
-; CHECK-SD-NEXT: ld1 { v0.b }[3], [x8]
+; CHECK-SD-NEXT: add x13, sp, #88
+; CHECK-SD-NEXT: ld1 { v4.b }[2], [x11]
+; CHECK-SD-NEXT: ld1 { v7.b }[1], [x13]
+; CHECK-SD-NEXT: add x13, sp, #856
+; CHECK-SD-NEXT: mov v0.b[2], w2
+; CHECK-SD-NEXT: add x14, sp, #1008
+; CHECK-SD-NEXT: add x15, sp, #872
+; CHECK-SD-NEXT: ld1 { v5.b }[3], [x8]
; CHECK-SD-NEXT: add x8, sp, #240
-; CHECK-SD-NEXT: mov v16.b[2], w2
-; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
+; CHECK-SD-NEXT: add x16, sp, #888
+; CHECK-SD-NEXT: add x10, sp, #16
+; CHECK-SD-NEXT: add x9, sp, #24
+; CHECK-SD-NEXT: add x11, sp, #40
+; CHECK-SD-NEXT: movi v2.2d, #0000000000000000
+; CHECK-SD-NEXT: ld1 { v5.b }[4], [x8]
; CHECK-SD-NEXT: add x8, sp, #248
-; CHECK-SD-NEXT: mov v16.b[3], w3
-; CHECK-SD-NEXT: ld1 { v0.b }[5], [x8]
+; CHECK-SD-NEXT: mov v0.b[3], w3
+; CHECK-SD-NEXT: ld1 { v5.b }[5], [x8]
; CHECK-SD-NEXT: add x8, sp, #256
-; CHECK-SD-NEXT: ld1 { v0.b }[6], [x8]
+; CHECK-SD-NEXT: mov v0.b[4], w4
+; CHECK-SD-NEXT: ld1 { v5.b }[6], [x8]
; CHECK-SD-NEXT: add x8, sp, #264
-; CHECK-SD-NEXT: mov v16.b[4], w4
-; CHECK-SD-NEXT: ld1 { v0.b }[7], [x8]
-; CHECK-SD-NEXT: ldr b1, [x9]
+; CHECK-SD-NEXT: mov v0.b[5], w5
+; CHECK-SD-NEXT: ld1 { v5.b }[7], [x8]
+; CHECK-SD-NEXT: add x8, sp, #272
+; CHECK-SD-NEXT: ld1 { v5.b }[8], [x8]
; CHECK-SD-NEXT: add x8, sp, #280
-; CHECK-SD-NEXT: add x9, sp, #88
-; CHECK-SD-NEXT: mov v16.b[5], w5
-; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8]
+; CHECK-SD-NEXT: mov v0.b[6], w6
+; CHECK-SD-NEXT: ld1 { v5.b }[9], [x8]
; CHECK-SD-NEXT: add x8, sp, #288
-; CHECK-SD-NEXT: ld1 { v1.b }[2], [x8]
+; CHECK-SD-NEXT: mov v0.b[7], w7
+; CHECK-SD-NEXT: ld1 { v5.b }[10], [x8]
; CHECK-SD-NEXT: add x8, sp, #296
-; CHECK-SD-NEXT: mov v16.b[6], w6
-; CHECK-SD-NEXT: ld1 { v1.b }[3], [x8]
+; CHECK-SD-NEXT: ld1 { v0.b }[8], [x10]
+; CHECK-SD-NEXT: add x10, sp, #128
+; CHECK-SD-NEXT: ld1 { v5.b }[11], [x8]
; CHECK-SD-NEXT: add x8, sp, #304
-; CHECK-SD-NEXT: mov v16.b[7], w7
-; CHECK-SD-NEXT: ld1 { v1.b }[4], [x8]
+; CHECK-SD-NEXT: ld1 { v0.b }[9], [x9]
+; CHECK-SD-NEXT: add x9, sp, #136
+; CHECK-SD-NEXT: ld1 { v5.b }[12], [x8]
; CHECK-SD-NEXT: add x8, sp, #312
-; CHECK-SD-NEXT: ld1 { v1.b }[5], [x8]
+; CHECK-SD-NEXT: ld1 { v5.b }[13], [x8]
; CHECK-SD-NEXT: add x8, sp, #320
-; CHECK-SD-NEXT: ld1 { v1.b }[6], [x8]
-; CHECK-SD-NEXT: add x8, sp, #328
-; CHECK-SD-NEXT: ld1 { v1.b }[7], [x8]
-; CHECK-SD-NEXT: ld1 { v2.b }[1], [x9]
-; CHECK-SD-NEXT: add x8, sp, #96
-; CHECK-SD-NEXT: add x9, sp, #144
-; CHECK-SD-NEXT: ld1 { v2.b }[2], [x8]
-; CHECK-SD-NEXT: add x8, sp, #104
-; CHECK-SD-NEXT: zip1 v0.2d, v0.2d, v1.2d
-; CHECK-SD-NEXT: movi v1.16b, #1
-; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8]
-; CHECK-SD-NEXT: add x8, sp, #112
-; CHECK-SD-NEXT: ld1 { v2.b }[4], [x8]
-; CHECK-SD-NEXT: add x8, sp, #120
-; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8]
-; CHECK-SD-NEXT: add x8, sp, #128
-; CHECK-SD-NEXT: ld1 { v2.b }[6], [x8]
-; CHECK-SD-NEXT: add x8, sp, #136
-; CHECK-SD-NEXT: ld1 { v2.b }[7], [x8]
-; CHECK-SD-NEXT: ldr b3, [x9]
+; CHECK-SD-NEXT: ld1 { v5.b }[14], [x8]
+; CHECK-SD-NEXT: add x8, sp, #32
+; CHECK-SD-NEXT: ld1 { v0.b }[10], [x8]
+; CHECK-SD-NEXT: add x8, sp, #144
+; CHECK-SD-NEXT: ld1 { v5.b }[15], [x12]
+; CHECK-SD-NEXT: add x12, sp, #728
+; CHECK-SD-NEXT: ld1 { v6.b }[1], [x12]
+; CHECK-SD-NEXT: add x12, sp, #1000
+; CHECK-SD-NEXT: ld1 { v0.b }[11], [x11]
+; CHECK-SD-NEXT: ld1 { v4.b }[3], [x12]
+; CHECK-SD-NEXT: add x12, sp, #736
+; CHECK-SD-NEXT: add x11, sp, #920
+; CHECK-SD-NEXT: sdot v3.4s, v5.16b, v1.16b
+; CHECK-SD-NEXT: ldr b5, [sp, #848]
+; CHECK-SD-NEXT: ld1 { v6.b }[2], [x12]
+; CHECK-SD-NEXT: add x12, sp, #48
+; CHECK-SD-NEXT: ld1 { v5.b }[1], [x13]
+; CHECK-SD-NEXT: add x13, sp, #744
+; CHECK-SD-NEXT: ld1 { v4.b }[4], [x14]
+; CHECK-SD-NEXT: add x14, sp, #96
+; CHECK-SD-NEXT: ld1 { v0.b }[12], [x12]
+; CHECK-SD-NEXT: ld1 { v6.b }[3], [x13]
+; CHECK-SD-NEXT: add x13, sp, #864
+; CHECK-SD-NEXT: ld1 { v7.b }[2], [x14]
+; CHECK-SD-NEXT: add x14, sp, #1016
+; CHECK-SD-NEXT: ld1 { v5.b }[2], [x13]
+; CHECK-SD-NEXT: add x13, sp, #752
+; CHECK-SD-NEXT: ld1 { v4.b }[5], [x14]
+; CHECK-SD-NEXT: add x14, sp, #104
+; CHECK-SD-NEXT: ld1 { v6.b }[4], [x13]
+; CHECK-SD-NEXT: add x13, sp, #1024
+; CHECK-SD-NEXT: ld1 { v7.b }[3], [x14]
+; CHECK-SD-NEXT: ld1 { v5.b }[3], [x15]
+; CHECK-SD-NEXT: add x15, sp, #760
+; CHECK-SD-NEXT: add x14, sp, #112
+; CHECK-SD-NEXT: ld1 { v4.b }[6], [x13]
+; CHECK-SD-NEXT: add x13, sp, #880
+; CHECK-SD-NEXT: ld1 { v6.b }[5], [x15]
+; CHECK-SD-NEXT: add x15, sp, #1032
+; CHECK-SD-NEXT: ld1 { v7.b }[4], [x14]
+; CHECK-SD-NEXT: ld1 { v5.b }[4], [x13]
+; CHECK-SD-NEXT: add x14, sp, #768
+; CHECK-SD-NEXT: add x13, sp, #120
+; CHECK-SD-NEXT: ld1 { v4.b }[7], [x15]
+; CHECK-SD-NEXT: add x15, sp, #1040
+; CHECK-SD-NEXT: ld1 { v6.b }[6], [x14]
+; CHECK-SD-NEXT: ld1 { v7.b }[5], [x13]
+; CHECK-SD-NEXT: add x13, sp, #776
+; CHECK-SD-NEXT: ld1 { v5.b }[5], [x16]
+; CHECK-SD-NEXT: add x14, sp, #1048
+; CHECK-SD-NEXT: ld1 { v4.b }[8], [x15]
+; CHECK-SD-NEXT: add x15, sp, #896
+; CHECK-SD-NEXT: ld1 { v6.b }[7], [x13]
+; CHECK-SD-NEXT: ld1 { v7.b }[6], [x10]
+; CHECK-SD-NEXT: add x10, sp, #784
+; CHECK-SD-NEXT: ld1 { v5.b }[6], [x15]
+; CHECK-SD-NEXT: add x13, sp, #1056
+; CHECK-SD-NEXT: ld1 { v4.b }[9], [x14]
+; CHECK-SD-NEXT: add x14, sp, #904
+; CHECK-SD-NEXT: ld1 { v6.b }[8], [x10]
+; CHECK-SD-NEXT: ld1 { v7.b }[7], [x9]
+; CHECK-SD-NEXT: add x9, sp, #792
+; CHECK-SD-NEXT: ld1 { v5.b }[7], [x14]
+; CHECK-SD-NEXT: add x10, sp, #1064
+; CHECK-SD-NEXT: ld1 { v4.b }[10], [x13]
+; CHECK-SD-NEXT: add x13, sp, #912
+; CHECK-SD-NEXT: ld1 { v6.b }[9], [x9]
+; CHECK-SD-NEXT: ld1 { v7.b }[8], [x8]
+; CHECK-SD-NEXT: add x9, sp, #800
+; CHECK-SD-NEXT: ld1 { v5.b }[8], [x13]
; CHECK-SD-NEXT: add x8, sp, #152
-; CHECK-SD-NEXT: add x9, sp, #984
-; CHECK-SD-NEXT: ld1 { v3.b }[1], [x8]
-; CHECK-SD-NEXT: add x8, sp, #160
-; CHECK-SD-NEXT: ld1 { v3.b }[2], [x8]
-; CHECK-SD-NEXT: add x8, sp, #168
-; CHECK-SD-NEXT: ld1 { v3.b }[3], [x8]
-; CHECK-SD-NEXT: add x8, sp, #176
-; CHECK-SD-NEXT: ld1 { v3.b }[4], [x8]
-; CHECK-SD-NEXT: add x8, sp, #184
-; CHECK-SD-NEXT: ld1 { v3.b }[5], [x8]
-; CHECK-SD-NEXT: add x8, sp, #192
-; CHECK-SD-NEXT: ld1 { v3.b }[6], [x8]
-; CHECK-SD-NEXT: add x8, sp, #200
-; CHECK-SD-NEXT: ld1 { v3.b }[7], [x8]
-; CHECK-SD-NEXT: ld1 { v4.b }[1], [x9]
-; CHECK-SD-NEXT: add x8, sp, #992
-; CHECK-SD-NEXT: add x9, sp, #1040
-; CHECK-SD-NEXT: ld1 { v4.b }[2], [x8]
-; CHECK-SD-NEXT: add x8, sp, #1000
-; CHECK-SD-NEXT: zip1 v2.2d, v2.2d, v3.2d
-; CHECK-SD-NEXT: ld1 { v4.b }[3], [x8]
-; CHECK-SD-NEXT: add x8, sp, #1008
-; CHECK-SD-NEXT: ld1 { v4.b }[4], [x8]
-; CHECK-SD-NEXT: add x8, sp, #1016
-; CHECK-SD-NEXT: ld1 { v4.b }[5], [x8]
-; CHECK-SD-NEXT: add x8, sp, #1024
-; CHECK-SD-NEXT: ld1 { v4.b }[6], [x8]
-; CHECK-SD-NEXT: add x8, sp, #1032
-; CHECK-SD-NEXT: ld1 { v4.b }[7], [x8]
-; CHECK-SD-NEXT: ldr b5, [x9]
-; CHECK-SD-NEXT: add x8, sp, #1048
-; CHECK-SD-NEXT: add x9, sp, #728
-; CHECK-SD-NEXT: ld1 { v5.b }[1], [x8]
-; CHECK-SD-NEXT: add x8, sp, #1056
-; CHECK-SD-NEXT: ld1 { v5.b }[2], [x8]
-; CHECK-SD-NEXT: add x8, sp, #1064
-; CHECK-SD-NEXT: ld1 { v5.b }[3], [x8]
-; CHECK-SD-NEXT: add x8, sp, #1072
-; CHECK-SD-NEXT: ld1 { v5.b }[4], [x8]
-; CHECK-SD-NEXT: add x8, sp, #1080
-; CHECK-SD-NEXT: ld1 { v5.b }[5], [x8]
-; CHECK-SD-NEXT: add x8, sp, #1088
-; CHECK-SD-NEXT: ld1 { v5.b }[6], [x8]
-; CHECK-SD-NEXT: add x8, sp, #1096
-; CHECK-SD-NEXT: ld1 { v5.b }[7], [x8]
-; CHECK-SD-NEXT: ld1 { v6.b }[1], [x9]
-; CHECK-SD-NEXT: add x8, sp, #736
-; CHECK-SD-NEXT: add x9, sp, #784
-; CHECK-SD-NEXT: ld1 { v6.b }[2], [x8]
-; CHECK-SD-NEXT: add x8, sp, #744
-; CHECK-SD-NEXT: zip1 v4.2d, v4.2d, v5.2d
-; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
-; CHECK-SD-NEXT: ld1 { v6.b }[3], [x8]
-; CHECK-SD-NEXT: add x8, sp, #752
-; CHECK-SD-NEXT: sdot v19.4s, v4.16b, v1.16b
-; CHECK-SD-NEXT: sdot v5.4s, v0.16b, v1.16b
-; CHECK-SD-NEXT: ld1 { v6.b }[4], [x8]
-; CHECK-SD-NEXT: add x8, sp, #760
-; CHECK-SD-NEXT: ld1 { v6.b }[5], [x8]
-; CHECK-SD-NEXT: add x8, sp, #768
-; CHECK-SD-NEXT: ld1 { v6.b }[6], [x8]
-; CHECK-SD-NEXT: add x8, sp, #776
-; CHECK-SD-NEXT: ld1 { v6.b }[7], [x8]
-; CHECK-SD-NEXT: ldr b7, [x9]
-; CHECK-SD-NEXT: add x8, sp, #792
-; CHECK-SD-NEXT: add x9, sp, #856
-; CHECK-SD-NEXT: ld1 { v7.b }[1], [x8]
-; CHECK-SD-NEXT: add x8, sp, #800
-; CHECK-SD-NEXT: ld1 { v7.b }[2], [x8]
-; CHECK-SD-NEXT: add x8, sp, #808
-; CHECK-SD-NEXT: ld1 { v7.b }[3], [x8]
+; CHECK-SD-NEXT: ld1 { v4.b }[11], [x10]
+; CHECK-SD-NEXT: add x10, sp, #1072
+; CHECK-SD-NEXT: ld1 { v6.b }[10], [x9]
+; CHECK-SD-NEXT: ld1 { v7.b }[9], [x8]
+; CHECK-SD-NEXT: add x9, sp, #808
+; CHECK-SD-NEXT: ld1 { v5.b }[9], [x11]
+; CHECK-SD-NEXT: add x8, sp, #56
+; CHECK-SD-NEXT: ld1 { v4.b }[12], [x10]
+; CHECK-SD-NEXT: add x10, sp, #160
+; CHECK-SD-NEXT: ld1 { v0.b }[13], [x8]
+; CHECK-SD-NEXT: ld1 { v6.b }[11], [x9]
+; CHECK-SD-NEXT: add x9, sp, #928
+; CHECK-SD-NEXT: ld1 { v7.b }[10], [x10]
+; CHECK-SD-NEXT: add x10, sp, #1080
+; CHECK-SD-NEXT: ld1 { v5.b }[10], [x9]
; CHECK-SD-NEXT: add x8, sp, #816
-; CHECK-SD-NEXT: ld1 { v7.b }[4], [x8]
-; CHECK-SD-NEXT: add x8, sp, #824
-; CHECK-SD-NEXT: ld1 { v7.b }[5], [x8]
-; CHECK-SD-NEXT: add x8, sp, #832
-; CHECK-SD-NEXT: ld1 { v7.b }[6], [x8]
-; CHECK-SD-NEXT: add x8, sp, #840
-; CHECK-SD-NEXT: ld1 { v7.b }[7], [x8]
-; CHECK-SD-NEXT: ld1 { v17.b }[1], [x9]
-; CHECK-SD-NEXT: add x8, sp, #864
-; CHECK-SD-NEXT: add x9, sp, #16
-; CHECK-SD-NEXT: ld1 { v16.b }[8], [x9]
-; CHECK-SD-NEXT: add x9, sp, #912
-; CHECK-SD-NEXT: ld1 { v17.b }[2], [x8]
-; CHECK-SD-NEXT: add x8, sp, #872
-; CHECK-SD-NEXT: zip1 v0.2d, v6.2d, v7.2d
-; CHECK-SD-NEXT: ld1 { v16.b }[9], [x10]
-; CHECK-SD-NEXT: ld1 { v17.b }[3], [x8]
-; CHECK-SD-NEXT: add x8, sp, #880
-; CHECK-SD-NEXT: sdot v19.4s, v0.16b, v1.16b
-; CHECK-SD-NEXT: ld1 { v17.b }[4], [x8]
-; CHECK-SD-NEXT: add x8, sp, #888
-; CHECK-SD-NEXT: ld1 { v17.b }[5], [x8]
-; CHECK-SD-NEXT: add x8, sp, #896
-; CHECK-SD-NEXT: ld1 { v17.b }[6], [x8]
-; CHECK-SD-NEXT: add x8, sp, #904
-; CHECK-SD-NEXT: ld1 { v17.b }[7], [x8]
-; CHECK-SD-NEXT: ldr b18, [x9]
-; CHECK-SD-NEXT: add x8, sp, #920
-; CHECK-SD-NEXT: ld1 { v18.b }[1], [x8]
-; CHECK-SD-NEXT: add x8, sp, #32
-; CHECK-SD-NEXT: ld1 { v16.b }[10], [x8]
-; CHECK-SD-NEXT: add x8, sp, #928
-; CHECK-SD-NEXT: ld1 { v18.b }[2], [x8]
-; CHECK-SD-NEXT: add x8, sp, #40
-; CHECK-SD-NEXT: ld1 { v16.b }[11], [x8]
+; CHECK-SD-NEXT: ld1 { v4.b }[13], [x10]
+; CHECK-SD-NEXT: add x9, sp, #168
+; CHECK-SD-NEXT: add x10, sp, #176
+; CHECK-SD-NEXT: ld1 { v6.b }[12], [x8]
; CHECK-SD-NEXT: add x8, sp, #936
-; CHECK-SD-NEXT: ld1 { v18.b }[3], [x8]
-; CHECK-SD-NEXT: add x8, sp, #48
-; CHECK-SD-NEXT: ld1 { v16.b }[12], [x8]
-; CHECK-SD-NEXT: add x8, sp, #944
-; CHECK-SD-NEXT: ld1 { v18.b }[4], [x8]
-; CHECK-SD-NEXT: add x8, sp, #56
-; CHECK-SD-NEXT: ld1 { v16.b }[13], [x8]
-; CHECK-SD-NEXT: add x8, sp, #952
-; CHECK-SD-NEXT: ld1 { v18.b }[5], [x8]
+; CHECK-SD-NEXT: ld1 { v7.b }[11], [x9]
+; CHECK-SD-NEXT: add x9, sp, #1088
+; CHECK-SD-NEXT: ld1 { v5.b }[11], [x8]
; CHECK-SD-NEXT: add x8, sp, #64
-; CHECK-SD-NEXT: ld1 { v16.b }[14], [x8]
+; CHECK-SD-NEXT: ld1 { v4.b }[14], [x9]
+; CHECK-SD-NEXT: add x9, sp, #824
+; CHECK-SD-NEXT: ld1 { v0.b }[14], [x8]
+; CHECK-SD-NEXT: ld1 { v6.b }[13], [x9]
+; CHECK-SD-NEXT: add x9, sp, #944
+; CHECK-SD-NEXT: ld1 { v7.b }[12], [x10]
+; CHECK-SD-NEXT: add x10, sp, #1096
+; CHECK-SD-NEXT: ld1 { v5.b }[12], [x9]
+; CHECK-SD-NEXT: add x8, sp, #832
+; CHECK-SD-NEXT: ld1 { v4.b }[15], [x10]
+; CHECK-SD-NEXT: add x9, sp, #184
+; CHECK-SD-NEXT: add x10, sp, #72
+; CHECK-SD-NEXT: ld1 { v6.b }[14], [x8]
+; CHECK-SD-NEXT: add x8, sp, #952
+; CHECK-SD-NEXT: ld1 { v7.b }[13], [x9]
+; CHECK-SD-NEXT: ld1 { v5.b }[13], [x8]
+; CHECK-SD-NEXT: add x8, sp, #840
+; CHECK-SD-NEXT: ld1 { v0.b }[15], [x10]
+; CHECK-SD-NEXT: sdot v2.4s, v4.16b, v1.16b
+; CHECK-SD-NEXT: add x9, sp, #192
+; CHECK-SD-NEXT: ld1 { v6.b }[15], [x8]
; CHECK-SD-NEXT: add x8, sp, #960
-; CHECK-SD-NEXT: ld1 { v18.b }[6], [x8]
-; CHECK-SD-NEXT: add x8, sp, #72
-; CHECK-SD-NEXT: ld1 { v16.b }[15], [x8]
-; CHECK-SD-NEXT: add x8, sp, #968
-; CHECK-SD-NEXT: ld1 { v18.b }[7], [x8]
-; CHECK-SD-NEXT: sdot v5.4s, v16.16b, v1.16b
-; CHECK-SD-NEXT: zip1 v0.2d, v17.2d, v18.2d
-; CHECK-SD-NEXT: sdot v5.4s, v2.16b, v1.16b
-; CHECK-SD-NEXT: sdot v19.4s, v0.16b, v1.16b
-; CHECK-SD-NEXT: add v0.4s, v5.4s, v19.4s
+; CHECK-SD-NEXT: ld1 { v7.b }[14], [x9]
+; CHECK-SD-NEXT: ld1 { v5.b }[14], [x8]
+; CHECK-SD-NEXT: sdot v3.4s, v0.16b, v1.16b
+; CHECK-SD-NEXT: add x8, sp, #200
+; CHECK-SD-NEXT: add x9, sp, #968
+; CHECK-SD-NEXT: sdot v2.4s, v6.16b, v1.16b
+; CHECK-SD-NEXT: ld1 { v7.b }[15], [x8]
+; CHECK-SD-NEXT: ld1 { v5.b }[15], [x9]
+; CHECK-SD-NEXT: sdot v3.4s, v7.16b, v1.16b
+; CHECK-SD-NEXT: sdot v2.4s, v5.16b, v1.16b
+; CHECK-SD-NEXT: add v0.4s, v3.4s, v2.4s
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/nontemporal.ll b/llvm/test/CodeGen/AArch64/nontemporal.ll
index f7a87ae..f8ba150 100644
--- a/llvm/test/CodeGen/AArch64/nontemporal.ll
+++ b/llvm/test/CodeGen/AArch64/nontemporal.ll
@@ -683,43 +683,41 @@ define void @test_stnp_v17f32(<17 x float> %v, ptr %ptr) {
;
; CHECK-BE-LABEL: test_stnp_v17f32:
; CHECK-BE: // %bb.0: // %entry
-; CHECK-BE-NEXT: // kill: def $s1 killed $s1 def $q1
-; CHECK-BE-NEXT: // kill: def $s0 killed $s0 def $q0
; CHECK-BE-NEXT: // kill: def $s4 killed $s4 def $q4
-; CHECK-BE-NEXT: // kill: def $s5 killed $s5 def $q5
-; CHECK-BE-NEXT: add x8, sp, #12
-; CHECK-BE-NEXT: add x9, sp, #20
+; CHECK-BE-NEXT: // kill: def $s0 killed $s0 def $q0
; CHECK-BE-NEXT: ldr s16, [sp, #36]
-; CHECK-BE-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-BE-NEXT: ldr s1, [sp, #4]
+; CHECK-BE-NEXT: // kill: def $s5 killed $s5 def $q5
+; CHECK-BE-NEXT: // kill: def $s1 killed $s1 def $q1
+; CHECK-BE-NEXT: ldr s17, [sp, #4]
+; CHECK-BE-NEXT: add x8, sp, #44
; CHECK-BE-NEXT: mov v4.s[1], v5.s[0]
-; CHECK-BE-NEXT: add x10, sp, #52
+; CHECK-BE-NEXT: mov v0.s[1], v1.s[0]
; CHECK-BE-NEXT: // kill: def $s6 killed $s6 def $q6
; CHECK-BE-NEXT: // kill: def $s2 killed $s2 def $q2
; CHECK-BE-NEXT: // kill: def $s7 killed $s7 def $q7
; CHECK-BE-NEXT: // kill: def $s3 killed $s3 def $q3
-; CHECK-BE-NEXT: ld1 { v1.s }[1], [x8]
-; CHECK-BE-NEXT: ldr s5, [x9]
-; CHECK-BE-NEXT: add x8, sp, #28
-; CHECK-BE-NEXT: add x9, sp, #44
-; CHECK-BE-NEXT: ld1 { v5.s }[1], [x8]
-; CHECK-BE-NEXT: ld1 { v16.s }[1], [x9]
-; CHECK-BE-NEXT: ldr s17, [x10]
-; CHECK-BE-NEXT: add x8, sp, #60
+; CHECK-BE-NEXT: ldr s1, [sp, #68]
+; CHECK-BE-NEXT: ld1 { v16.s }[1], [x8]
+; CHECK-BE-NEXT: add x8, sp, #12
+; CHECK-BE-NEXT: ld1 { v17.s }[1], [x8]
+; CHECK-BE-NEXT: add x8, sp, #52
+; CHECK-BE-NEXT: str s1, [x0, #64]
+; CHECK-BE-NEXT: ld1 { v16.s }[2], [x8]
+; CHECK-BE-NEXT: add x8, sp, #20
; CHECK-BE-NEXT: mov v4.s[2], v6.s[0]
; CHECK-BE-NEXT: mov v0.s[2], v2.s[0]
-; CHECK-BE-NEXT: ld1 { v17.s }[1], [x8]
-; CHECK-BE-NEXT: ldr s2, [sp, #68]
-; CHECK-BE-NEXT: add x8, x0, #32
-; CHECK-BE-NEXT: zip1 v1.2d, v1.2d, v5.2d
-; CHECK-BE-NEXT: add x9, x0, #48
-; CHECK-BE-NEXT: str s2, [x0, #64]
-; CHECK-BE-NEXT: zip1 v5.2d, v16.2d, v17.2d
+; CHECK-BE-NEXT: ld1 { v17.s }[2], [x8]
+; CHECK-BE-NEXT: add x8, sp, #60
+; CHECK-BE-NEXT: ld1 { v16.s }[3], [x8]
+; CHECK-BE-NEXT: add x8, sp, #28
+; CHECK-BE-NEXT: ld1 { v17.s }[3], [x8]
; CHECK-BE-NEXT: mov v4.s[3], v7.s[0]
+; CHECK-BE-NEXT: add x8, x0, #48
; CHECK-BE-NEXT: mov v0.s[3], v3.s[0]
-; CHECK-BE-NEXT: st1 { v1.4s }, [x8]
+; CHECK-BE-NEXT: st1 { v16.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x0, #32
+; CHECK-BE-NEXT: st1 { v17.4s }, [x8]
; CHECK-BE-NEXT: add x8, x0, #16
-; CHECK-BE-NEXT: st1 { v5.4s }, [x9]
; CHECK-BE-NEXT: st1 { v4.4s }, [x8]
; CHECK-BE-NEXT: st1 { v0.4s }, [x0]
; CHECK-BE-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/preferred-function-alignment.ll b/llvm/test/CodeGen/AArch64/preferred-function-alignment.ll
index 05f4fb1..a6cb712 100644
--- a/llvm/test/CodeGen/AArch64/preferred-function-alignment.ll
+++ b/llvm/test/CodeGen/AArch64/preferred-function-alignment.ll
@@ -40,3 +40,10 @@ define void @test_optsize() optsize {
; CHECK-LABEL: test_optsize
; CHECK-NEXT: .p2align 2
+
+define void @test_minsize() minsize {
+ ret void
+}
+
+; CHECK-LABEL: test_minsize
+; CHECK-NEXT: .p2align 2
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-isel.ll b/llvm/test/CodeGen/AArch64/ptrauth-isel.ll
new file mode 100644
index 0000000..7011b94
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ptrauth-isel.ll
@@ -0,0 +1,269 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple arm64e-apple-darwin -verify-machineinstrs -stop-after=finalize-isel -global-isel=0 \
+; RUN: | FileCheck %s --check-prefixes=DAGISEL
+; RUN: llc < %s -mtriple arm64e-apple-darwin -verify-machineinstrs -stop-after=finalize-isel -global-isel=1 -global-isel-abort=1 \
+; RUN: | FileCheck %s --check-prefixes=GISEL
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -verify-machineinstrs -stop-after=finalize-isel -global-isel=0 \
+; RUN: | FileCheck %s --check-prefixes=DAGISEL
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -verify-machineinstrs -stop-after=finalize-isel -global-isel=1 -global-isel-abort=1 \
+; RUN: | FileCheck %s --check-prefixes=GISEL
+
+; Check MIR produced by the instruction selector to validate properties that
+; cannot be reliably tested by only inspecting the final asm output.
+
+@discvar = dso_local global i64 0
+
+; Make sure the components of blend(addr, imm) and integer constants are
+; recognized and passed to the PAC pseudo via separate operands to prevent
+; substitution of the immediate modifier.
+;
+; The MIR output of the instruction selector is inspected because, in the final
+; asm output alone, a MOVKXi immediately followed by a PAC pseudo is hard to
+; reliably distinguish from a standalone pseudo that carries the address and
+; immediate modifiers in separate operands.
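+;
+; As an illustrative sketch (not one of the checked functions), the IR shape
+; exercised throughout this file pairs a blended discriminator with a signing
+; intrinsic:
+;
+;   %disc = call i64 @llvm.ptrauth.blend(i64 %addrdisc, i64 42)
+;   %signed = call i64 @llvm.ptrauth.sign(i64 %addr, i32 2, i64 %disc)
+;
+; and the selector is expected to keep the address and immediate halves of the
+; blend in separate operands of the PAC pseudo.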
+
+define i64 @small_imm_disc_optimized(i64 %addr) {
+ ; DAGISEL-LABEL: name: small_imm_disc_optimized
+ ; DAGISEL: bb.0.entry:
+ ; DAGISEL-NEXT: liveins: $x0
+ ; DAGISEL-NEXT: {{ $}}
+ ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; DAGISEL-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 42
+ ; DAGISEL-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64noip = SUBREG_TO_REG 0, killed [[MOVi32imm]], %subreg.sub_32
+ ; DAGISEL-NEXT: [[PAC:%[0-9]+]]:gpr64 = PAC [[COPY]], 2, 42, killed $noreg, implicit-def dead $x16, implicit-def dead $x17
+ ; DAGISEL-NEXT: $x0 = COPY [[PAC]]
+ ; DAGISEL-NEXT: RET_ReallyLR implicit $x0
+ ;
+ ; GISEL-LABEL: name: small_imm_disc_optimized
+ ; GISEL: bb.1.entry:
+ ; GISEL-NEXT: liveins: $x0
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; GISEL-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 42
+ ; GISEL-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64noip = SUBREG_TO_REG 0, [[MOVi32imm]], %subreg.sub_32
+ ; GISEL-NEXT: [[PAC:%[0-9]+]]:gpr64 = PAC [[COPY]], 2, 42, $noreg, implicit-def dead $x16, implicit-def dead $x17
+ ; GISEL-NEXT: $x0 = COPY [[PAC]]
+ ; GISEL-NEXT: RET_ReallyLR implicit $x0
+entry:
+ %signed = call i64 @llvm.ptrauth.sign(i64 %addr, i32 2, i64 42)
+ ret i64 %signed
+}
+
+; Without optimization, MOVi64imm may be used for small i64 constants as well.
+define i64 @small_imm_disc_non_optimized(i64 %addr) noinline optnone {
+ ; DAGISEL-LABEL: name: small_imm_disc_non_optimized
+ ; DAGISEL: bb.0.entry:
+ ; DAGISEL-NEXT: liveins: $x0
+ ; DAGISEL-NEXT: {{ $}}
+ ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY killed [[COPY]]
+ ; DAGISEL-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 42
+ ; DAGISEL-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64noip = SUBREG_TO_REG 0, killed [[MOVi32imm]], %subreg.sub_32
+ ; DAGISEL-NEXT: [[PAC:%[0-9]+]]:gpr64 = PAC [[COPY1]], 2, 42, killed $noreg, implicit-def dead $x16, implicit-def dead $x17
+ ; DAGISEL-NEXT: [[COPY2:%[0-9]+]]:gpr64all = COPY [[PAC]]
+ ; DAGISEL-NEXT: $x0 = COPY [[COPY2]]
+ ; DAGISEL-NEXT: RET_ReallyLR implicit $x0
+ ;
+ ; GISEL-LABEL: name: small_imm_disc_non_optimized
+ ; GISEL: bb.1.entry:
+ ; GISEL-NEXT: liveins: $x0
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; GISEL-NEXT: [[MOVi64imm:%[0-9]+]]:gpr64noip = MOVi64imm 42
+ ; GISEL-NEXT: [[PAC:%[0-9]+]]:gpr64 = PAC [[COPY]], 2, 42, $noreg, implicit-def dead $x16, implicit-def dead $x17
+ ; GISEL-NEXT: $x0 = COPY [[PAC]]
+ ; GISEL-NEXT: RET_ReallyLR implicit $x0
+entry:
+ %signed = call i64 @llvm.ptrauth.sign(i64 %addr, i32 2, i64 42)
+ ret i64 %signed
+}
+
+define i64 @large_imm_disc_wreg(i64 %addr) {
+ ; DAGISEL-LABEL: name: large_imm_disc_wreg
+ ; DAGISEL: bb.0.entry:
+ ; DAGISEL-NEXT: liveins: $x0
+ ; DAGISEL-NEXT: {{ $}}
+ ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; DAGISEL-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 12345678
+ ; DAGISEL-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64noip = SUBREG_TO_REG 0, killed [[MOVi32imm]], %subreg.sub_32
+ ; DAGISEL-NEXT: [[PAC:%[0-9]+]]:gpr64 = PAC [[COPY]], 2, 0, killed [[SUBREG_TO_REG]], implicit-def dead $x16, implicit-def dead $x17
+ ; DAGISEL-NEXT: $x0 = COPY [[PAC]]
+ ; DAGISEL-NEXT: RET_ReallyLR implicit $x0
+ ;
+ ; GISEL-LABEL: name: large_imm_disc_wreg
+ ; GISEL: bb.1.entry:
+ ; GISEL-NEXT: liveins: $x0
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; GISEL-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 12345678
+ ; GISEL-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64noip = SUBREG_TO_REG 0, [[MOVi32imm]], %subreg.sub_32
+ ; GISEL-NEXT: [[PAC:%[0-9]+]]:gpr64 = PAC [[COPY]], 2, 0, [[SUBREG_TO_REG]], implicit-def dead $x16, implicit-def dead $x17
+ ; GISEL-NEXT: $x0 = COPY [[PAC]]
+ ; GISEL-NEXT: RET_ReallyLR implicit $x0
+entry:
+ %signed = call i64 @llvm.ptrauth.sign(i64 %addr, i32 2, i64 12345678)
+ ret i64 %signed
+}
+
+define i64 @large_imm_disc_xreg(i64 %addr) {
+ ; DAGISEL-LABEL: name: large_imm_disc_xreg
+ ; DAGISEL: bb.0.entry:
+ ; DAGISEL-NEXT: liveins: $x0
+ ; DAGISEL-NEXT: {{ $}}
+ ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; DAGISEL-NEXT: [[MOVi64imm:%[0-9]+]]:gpr64noip = MOVi64imm 123456789012345
+ ; DAGISEL-NEXT: [[PAC:%[0-9]+]]:gpr64 = PAC [[COPY]], 2, 0, killed [[MOVi64imm]], implicit-def dead $x16, implicit-def dead $x17
+ ; DAGISEL-NEXT: $x0 = COPY [[PAC]]
+ ; DAGISEL-NEXT: RET_ReallyLR implicit $x0
+ ;
+ ; GISEL-LABEL: name: large_imm_disc_xreg
+ ; GISEL: bb.1.entry:
+ ; GISEL-NEXT: liveins: $x0
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; GISEL-NEXT: [[MOVi64imm:%[0-9]+]]:gpr64noip = MOVi64imm 123456789012345
+ ; GISEL-NEXT: [[PAC:%[0-9]+]]:gpr64 = PAC [[COPY]], 2, 0, [[MOVi64imm]], implicit-def dead $x16, implicit-def dead $x17
+ ; GISEL-NEXT: $x0 = COPY [[PAC]]
+ ; GISEL-NEXT: RET_ReallyLR implicit $x0
+entry:
+ %signed = call i64 @llvm.ptrauth.sign(i64 %addr, i32 2, i64 123456789012345)
+ ret i64 %signed
+}
+
+; Make sure blend() is lowered as expected when optimization is disabled.
+define i64 @blended_disc_non_optimized(i64 %addr, i64 %addrdisc) noinline optnone {
+ ; DAGISEL-LABEL: name: blended_disc_non_optimized
+ ; DAGISEL: bb.0.entry:
+ ; DAGISEL-NEXT: liveins: $x0, $x1
+ ; DAGISEL-NEXT: {{ $}}
+ ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x1
+ ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x0
+ ; DAGISEL-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY killed [[COPY1]]
+ ; DAGISEL-NEXT: [[COPY3:%[0-9]+]]:gpr64 = COPY killed [[COPY]]
+ ; DAGISEL-NEXT: [[MOVKXi:%[0-9]+]]:gpr64 = MOVKXi [[COPY3]], 42, 48
+ ; DAGISEL-NEXT: [[COPY4:%[0-9]+]]:gpr64noip = COPY [[MOVKXi]]
+ ; DAGISEL-NEXT: [[COPY5:%[0-9]+]]:gpr64noip = COPY [[COPY3]]
+ ; DAGISEL-NEXT: [[PAC:%[0-9]+]]:gpr64 = PAC [[COPY2]], 2, 42, [[COPY5]], implicit-def dead $x16, implicit-def dead $x17
+ ; DAGISEL-NEXT: [[COPY6:%[0-9]+]]:gpr64all = COPY [[PAC]]
+ ; DAGISEL-NEXT: $x0 = COPY [[COPY6]]
+ ; DAGISEL-NEXT: RET_ReallyLR implicit $x0
+ ;
+ ; GISEL-LABEL: name: blended_disc_non_optimized
+ ; GISEL: bb.1.entry:
+ ; GISEL-NEXT: liveins: $x0, $x1
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; GISEL-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+ ; GISEL-NEXT: [[MOVKXi:%[0-9]+]]:gpr64noip = MOVKXi [[COPY1]], 42, 48
+ ; GISEL-NEXT: [[COPY2:%[0-9]+]]:gpr64noip = COPY [[COPY1]]
+ ; GISEL-NEXT: [[PAC:%[0-9]+]]:gpr64 = PAC [[COPY]], 2, 42, [[COPY2]], implicit-def dead $x16, implicit-def dead $x17
+ ; GISEL-NEXT: $x0 = COPY [[PAC]]
+ ; GISEL-NEXT: RET_ReallyLR implicit $x0
+entry:
+ %disc = call i64 @llvm.ptrauth.blend(i64 %addrdisc, i64 42)
+ %signed = call i64 @llvm.ptrauth.sign(i64 %addr, i32 2, i64 %disc)
+ ret i64 %signed
+}
+
+define i64 @blend_and_sign_same_bb(i64 %addr) {
+ ; DAGISEL-LABEL: name: blend_and_sign_same_bb
+ ; DAGISEL: bb.0.entry:
+ ; DAGISEL-NEXT: liveins: $x0
+ ; DAGISEL-NEXT: {{ $}}
+ ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; DAGISEL-NEXT: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) @discvar
+ ; DAGISEL-NEXT: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui killed [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) @discvar :: (dereferenceable load (s64) from @discvar)
+ ; DAGISEL-NEXT: [[MOVKXi:%[0-9]+]]:gpr64noip = MOVKXi [[LDRXui]], 42, 48
+ ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:gpr64noip = COPY [[LDRXui]]
+ ; DAGISEL-NEXT: [[PAC:%[0-9]+]]:gpr64 = PAC [[COPY]], 2, 42, killed [[COPY1]], implicit-def dead $x16, implicit-def dead $x17
+ ; DAGISEL-NEXT: $x0 = COPY [[PAC]]
+ ; DAGISEL-NEXT: RET_ReallyLR implicit $x0
+ ;
+ ; GISEL-LABEL: name: blend_and_sign_same_bb
+ ; GISEL: bb.1.entry:
+ ; GISEL-NEXT: liveins: $x0
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; GISEL-NEXT: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) @discvar
+ ; GISEL-NEXT: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) @discvar :: (dereferenceable load (s64) from @discvar)
+ ; GISEL-NEXT: [[MOVKXi:%[0-9]+]]:gpr64noip = MOVKXi [[LDRXui]], 42, 48
+ ; GISEL-NEXT: [[COPY1:%[0-9]+]]:gpr64noip = COPY [[LDRXui]]
+ ; GISEL-NEXT: [[PAC:%[0-9]+]]:gpr64 = PAC [[COPY]], 2, 42, [[COPY1]], implicit-def dead $x16, implicit-def dead $x17
+ ; GISEL-NEXT: $x0 = COPY [[PAC]]
+ ; GISEL-NEXT: RET_ReallyLR implicit $x0
+entry:
+ %addrdisc = load i64, ptr @discvar
+ %disc = call i64 @llvm.ptrauth.blend(i64 %addrdisc, i64 42)
+ %signed = call i64 @llvm.ptrauth.sign(i64 %addr, i32 2, i64 %disc)
+ ret i64 %signed
+}
+
+; In the test cases below, both %addrdisc and %disc are computed values (i.e.
+; neither global addresses nor function arguments) defined in a different basic
+; block, which makes them harder to express via ISD::PtrAuthGlobalAddress.
+
+define i64 @blend_and_sign_different_bbs(i64 %addr, i64 %cond) {
+ ; DAGISEL-LABEL: name: blend_and_sign_different_bbs
+ ; DAGISEL: bb.0.entry:
+ ; DAGISEL-NEXT: successors: %bb.1(0x50000000), %bb.2(0x30000000)
+ ; DAGISEL-NEXT: liveins: $x0, $x1
+ ; DAGISEL-NEXT: {{ $}}
+ ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x1
+ ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x0
+ ; DAGISEL-NEXT: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) @discvar
+ ; DAGISEL-NEXT: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui killed [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) @discvar :: (dereferenceable load (s64) from @discvar)
+ ; DAGISEL-NEXT: [[MOVKXi:%[0-9]+]]:gpr64 = MOVKXi [[LDRXui]], 42, 48
+ ; DAGISEL-NEXT: [[COPY2:%[0-9]+]]:gpr64noip = COPY [[MOVKXi]]
+ ; DAGISEL-NEXT: CBZX [[COPY]], %bb.2
+ ; DAGISEL-NEXT: B %bb.1
+ ; DAGISEL-NEXT: {{ $}}
+ ; DAGISEL-NEXT: bb.1.next:
+ ; DAGISEL-NEXT: successors: %bb.2(0x80000000)
+ ; DAGISEL-NEXT: {{ $}}
+ ; DAGISEL-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY [[COPY2]]
+ ; DAGISEL-NEXT: INLINEASM &nop, 1 /* sideeffect attdialect */, 3866633 /* reguse:GPR64common */, [[COPY3]]
+ ; DAGISEL-NEXT: {{ $}}
+ ; DAGISEL-NEXT: bb.2.exit:
+ ; DAGISEL-NEXT: [[COPY4:%[0-9]+]]:gpr64noip = COPY [[LDRXui]]
+ ; DAGISEL-NEXT: [[PAC:%[0-9]+]]:gpr64 = PAC [[COPY1]], 2, 42, [[COPY4]], implicit-def dead $x16, implicit-def dead $x17
+ ; DAGISEL-NEXT: $x0 = COPY [[PAC]]
+ ; DAGISEL-NEXT: RET_ReallyLR implicit $x0
+ ;
+ ; GISEL-LABEL: name: blend_and_sign_different_bbs
+ ; GISEL: bb.1.entry:
+ ; GISEL-NEXT: successors: %bb.2(0x50000000), %bb.3(0x30000000)
+ ; GISEL-NEXT: liveins: $x0, $x1
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; GISEL-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+ ; GISEL-NEXT: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) @discvar
+ ; GISEL-NEXT: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) @discvar :: (dereferenceable load (s64) from @discvar)
+ ; GISEL-NEXT: [[MOVKXi:%[0-9]+]]:gpr64noip = MOVKXi [[LDRXui]], 42, 48
+ ; GISEL-NEXT: CBZX [[COPY1]], %bb.3
+ ; GISEL-NEXT: B %bb.2
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: bb.2.next:
+ ; GISEL-NEXT: successors: %bb.3(0x80000000)
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY [[MOVKXi]]
+ ; GISEL-NEXT: INLINEASM &nop, 1 /* sideeffect attdialect */, 3866633 /* reguse:GPR64common */, [[COPY2]]
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: bb.3.exit:
+ ; GISEL-NEXT: [[COPY3:%[0-9]+]]:gpr64noip = COPY [[LDRXui]]
+ ; GISEL-NEXT: [[PAC:%[0-9]+]]:gpr64 = PAC [[COPY]], 2, 42, [[COPY3]], implicit-def dead $x16, implicit-def dead $x17
+ ; GISEL-NEXT: $x0 = COPY [[PAC]]
+ ; GISEL-NEXT: RET_ReallyLR implicit $x0
+entry:
+ %addrdisc = load i64, ptr @discvar
+ %disc = call i64 @llvm.ptrauth.blend(i64 %addrdisc, i64 42)
+ %cond.b = icmp ne i64 %cond, 0
+ br i1 %cond.b, label %next, label %exit
+
+next:
+ call void asm sideeffect "nop", "r"(i64 %disc)
+ br label %exit
+
+exit:
+ %signed = call i64 @llvm.ptrauth.sign(i64 %addr, i32 2, i64 %disc)
+ ret i64 %signed
+}
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-isel.mir b/llvm/test/CodeGen/AArch64/ptrauth-isel.mir
new file mode 100644
index 0000000..1a15588
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ptrauth-isel.mir
@@ -0,0 +1,205 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -o - %s -mtriple arm64e-apple-darwin -verify-machineinstrs \
+# RUN: -stop-after=finalize-isel -start-before=finalize-isel | FileCheck %s
+# RUN: llc -o - %s -mtriple aarch64-linux-gnu -mattr=+pauth -verify-machineinstrs \
+# RUN: -stop-after=finalize-isel -start-before=finalize-isel | FileCheck %s
+
+# This MIR-based test contains several test cases that are hard to implement
+# via an LLVM IR input. Most other test cases are in the ptrauth-isel.ll file.
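+#
+# As a rough sketch (mirroring the test bodies below, not an extra case), each
+# test hands the PAC pseudo a discriminator materialized by a preceding
+# instruction, for example:
+#
+#   %2:gpr64noip = MOVKXi %1, 42, 48
+#   %3:gpr64 = PAC %0, 2, 0, killed %2, implicit-def dead $x16, implicit-def dead $x17
+#
+# and checks whether the immediate modifier can be folded into the PAC
+# operands or must remain in a register.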
+
+--- |
+ @globalvar = dso_local global i64 0
+
+ define i64 @movk_correct_blend(i64 %a, i64 %b) {
+ entry:
+ ret i64 0
+ }
+
+ define i64 @movk_wrong_shift_amount(i64 %a, i64 %b) {
+ entry:
+ ret i64 0
+ }
+
+ define i64 @movk_non_immediate_operand(i64 %a, i64 %b) {
+ entry:
+ ret i64 0
+ }
+
+ define i64 @movi64imm_immediate_operand(i64 %a) {
+ entry:
+ ret i64 0
+ }
+
+ define i64 @movi64imm_non_immediate_operand(i64 %a) {
+ entry:
+ ret i64 0
+ }
+
+ define i64 @movi32imm_immediate_operand(i64 %a) {
+ entry:
+ ret i64 0
+ }
+
+ define i64 @movi32imm_non_immediate_operand(i64 %a) {
+ entry:
+ ret i64 0
+ }
+...
+---
+name: movk_correct_blend
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1
+
+ ; CHECK-LABEL: name: movk_correct_blend
+ ; CHECK: liveins: $x0, $x1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+ ; CHECK-NEXT: [[MOVKXi:%[0-9]+]]:gpr64noip = MOVKXi [[COPY1]], 42, 48
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64noip = COPY [[COPY1]]
+ ; CHECK-NEXT: [[PAC:%[0-9]+]]:gpr64 = PAC [[COPY]], 2, 42, killed [[COPY2]], implicit-def dead $x16, implicit-def dead $x17
+ ; CHECK-NEXT: $x0 = COPY [[PAC]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %0:gpr64 = COPY $x0
+ %1:gpr64 = COPY $x1
+ %2:gpr64noip = MOVKXi %1, 42, 48
+ %3:gpr64 = PAC %0, 2, 0, killed %2, implicit-def dead $x16, implicit-def dead $x17
+ $x0 = COPY %3
+ RET_ReallyLR implicit $x0
+...
+---
+name: movk_wrong_shift_amount
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1
+
+ ; CHECK-LABEL: name: movk_wrong_shift_amount
+ ; CHECK: liveins: $x0, $x1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+ ; CHECK-NEXT: [[MOVKXi:%[0-9]+]]:gpr64noip = MOVKXi [[COPY1]], 42, 0
+ ; CHECK-NEXT: [[PAC:%[0-9]+]]:gpr64 = PAC [[COPY]], 2, 0, killed [[MOVKXi]], implicit-def dead $x16, implicit-def dead $x17
+ ; CHECK-NEXT: $x0 = COPY [[PAC]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %0:gpr64 = COPY $x0
+ %1:gpr64 = COPY $x1
+ %2:gpr64noip = MOVKXi %1, 42, 0
+ %3:gpr64 = PAC %0, 2, 0, killed %2, implicit-def dead $x16, implicit-def dead $x17
+ $x0 = COPY %3
+ RET_ReallyLR implicit $x0
+...
+---
+name: movk_non_immediate_operand
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1
+
+ ; CHECK-LABEL: name: movk_non_immediate_operand
+ ; CHECK: liveins: $x0, $x1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
+ ; CHECK-NEXT: [[MOVKXi:%[0-9]+]]:gpr64noip = MOVKXi [[COPY1]], target-flags(aarch64-pageoff, aarch64-nc) @globalvar, 48
+ ; CHECK-NEXT: [[PAC:%[0-9]+]]:gpr64 = PAC [[COPY]], 2, 0, killed [[MOVKXi]], implicit-def dead $x16, implicit-def dead $x17
+ ; CHECK-NEXT: $x0 = COPY [[PAC]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %0:gpr64 = COPY $x0
+ %1:gpr64 = COPY $x1
+ %2:gpr64noip = MOVKXi %1, target-flags(aarch64-pageoff, aarch64-nc) @globalvar, 48
+ %3:gpr64 = PAC %0, 2, 0, killed %2, implicit-def dead $x16, implicit-def dead $x17
+ $x0 = COPY %3
+ RET_ReallyLR implicit $x0
+...
+---
+name: movi64imm_immediate_operand
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1
+
+ ; CHECK-LABEL: name: movi64imm_immediate_operand
+ ; CHECK: liveins: $x0, $x1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; CHECK-NEXT: [[MOVi64imm:%[0-9]+]]:gpr64noip = MOVi64imm 42
+ ; CHECK-NEXT: [[PAC:%[0-9]+]]:gpr64 = PAC [[COPY]], 2, 42, killed $noreg, implicit-def dead $x16, implicit-def dead $x17
+ ; CHECK-NEXT: $x0 = COPY [[PAC]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %0:gpr64 = COPY $x0
+ %1:gpr64noip = MOVi64imm 42
+ %2:gpr64 = PAC %0, 2, 0, killed %1, implicit-def dead $x16, implicit-def dead $x17
+ $x0 = COPY %2
+ RET_ReallyLR implicit $x0
+...
+---
+name: movi64imm_non_immediate_operand
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1
+
+ ; CHECK-LABEL: name: movi64imm_non_immediate_operand
+ ; CHECK: liveins: $x0, $x1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; CHECK-NEXT: [[MOVi64imm:%[0-9]+]]:gpr64noip = MOVi64imm target-flags(aarch64-pageoff, aarch64-nc) @globalvar
+ ; CHECK-NEXT: [[PAC:%[0-9]+]]:gpr64 = PAC [[COPY]], 2, 0, killed [[MOVi64imm]], implicit-def dead $x16, implicit-def dead $x17
+ ; CHECK-NEXT: $x0 = COPY [[PAC]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %0:gpr64 = COPY $x0
+ %1:gpr64noip = MOVi64imm target-flags(aarch64-pageoff, aarch64-nc) @globalvar
+ %2:gpr64 = PAC %0, 2, 0, killed %1, implicit-def dead $x16, implicit-def dead $x17
+ $x0 = COPY %2
+ RET_ReallyLR implicit $x0
+...
+---
+name: movi32imm_immediate_operand
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1
+
+ ; CHECK-LABEL: name: movi32imm_immediate_operand
+ ; CHECK: liveins: $x0, $x1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; CHECK-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 42
+ ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64noip = SUBREG_TO_REG 0, killed [[MOVi32imm]], %subreg.sub_32
+ ; CHECK-NEXT: [[PAC:%[0-9]+]]:gpr64 = PAC [[COPY]], 2, 42, killed $noreg, implicit-def dead $x16, implicit-def dead $x17
+ ; CHECK-NEXT: $x0 = COPY [[PAC]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %0:gpr64 = COPY $x0
+ %1:gpr32 = MOVi32imm 42
+ %2:gpr64noip = SUBREG_TO_REG 0, killed %1, %subreg.sub_32
+ %3:gpr64 = PAC %0, 2, 0, killed %2, implicit-def dead $x16, implicit-def dead $x17
+ $x0 = COPY %3
+ RET_ReallyLR implicit $x0
+...
+---
+name: movi32imm_non_immediate_operand
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1
+
+ ; CHECK-LABEL: name: movi32imm_non_immediate_operand
+ ; CHECK: liveins: $x0, $x1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
+ ; CHECK-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm target-flags(aarch64-pageoff, aarch64-nc) @globalvar
+ ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64noip = SUBREG_TO_REG 0, killed [[MOVi32imm]], %subreg.sub_32
+ ; CHECK-NEXT: [[PAC:%[0-9]+]]:gpr64 = PAC [[COPY]], 2, 0, killed [[SUBREG_TO_REG]], implicit-def dead $x16, implicit-def dead $x17
+ ; CHECK-NEXT: $x0 = COPY [[PAC]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %0:gpr64 = COPY $x0
+ %1:gpr32 = MOVi32imm target-flags(aarch64-pageoff, aarch64-nc) @globalvar
+ %2:gpr64noip = SUBREG_TO_REG 0, killed %1, %subreg.sub_32
+ %3:gpr64 = PAC %0, 2, 0, killed %2, implicit-def dead $x16, implicit-def dead $x17
+ $x0 = COPY %3
+ RET_ReallyLR implicit $x0
+...
diff --git a/llvm/test/CodeGen/AArch64/sched-past-vector-ldst.ll b/llvm/test/CodeGen/AArch64/sched-past-vector-ldst.ll
index cd53833..fc5012c 100644
--- a/llvm/test/CodeGen/AArch64/sched-past-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/sched-past-vector-ldst.ll
@@ -23,21 +23,21 @@ entry:
%scevgep = getelementptr %Struct, ptr %this, i64 0, i32 2, i64 8, i32 0
%vec1 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr %scevgep)
%ev1 = extractvalue { <4 x float>, <4 x float> } %vec1, 1
- %fm1 = fmul <4 x float> %f, %ev1
- %av1 = fadd <4 x float> %f, %fm1
+ %fm1 = fmul contract <4 x float> %f, %ev1
+ %av1 = fadd contract <4 x float> %f, %fm1
%ev2 = extractvalue { <4 x float>, <4 x float> } %vec1, 0
- %fm2 = fmul <4 x float> %f, %ev2
- %av2 = fadd <4 x float> %f, %fm2
+ %fm2 = fmul contract <4 x float> %f, %ev2
+ %av2 = fadd contract <4 x float> %f, %fm2
%scevgep2 = getelementptr %Struct, ptr %this, i64 0, i32 3, i64 8, i32 0
tail call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> %av2, <4 x float> %av1, ptr %scevgep2)
%scevgep3 = getelementptr %Struct, ptr %this, i64 0, i32 2, i64 12, i32 0
%vec2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr %scevgep3)
%ev3 = extractvalue { <4 x float>, <4 x float> } %vec2, 1
- %fm3 = fmul <4 x float> %f, %ev3
- %av3 = fadd <4 x float> %f, %fm3
+ %fm3 = fmul contract <4 x float> %f, %ev3
+ %av3 = fadd contract <4 x float> %f, %fm3
%ev4 = extractvalue { <4 x float>, <4 x float> } %vec2, 0
- %fm4 = fmul <4 x float> %f, %ev4
- %av4 = fadd <4 x float> %f, %fm4
+ %fm4 = fmul contract <4 x float> %f, %ev4
+ %av4 = fadd contract <4 x float> %f, %fm4
%scevgep4 = getelementptr %Struct, ptr %this, i64 0, i32 3, i64 12, i32 0
tail call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> %av4, <4 x float> %av3, ptr %scevgep4)
ret void
@@ -49,6 +49,6 @@ declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr) #2
; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float>, <4 x float>, ptr nocapture) #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
index f73b4bd..e29993d 100644
--- a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
@@ -2,15 +2,15 @@
; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu -mattr=+neon,-use-reciprocal-square-root | FileCheck %s --check-prefix=FAULT
; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu -mattr=+neon,+use-reciprocal-square-root | FileCheck %s
-declare float @llvm.sqrt.f32(float) #0
-declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) #0
-declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #0
-declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #0
-declare double @llvm.sqrt.f64(double) #0
-declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) #0
-declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) #0
+declare float @llvm.sqrt.f32(float)
+declare <2 x float> @llvm.sqrt.v2f32(<2 x float>)
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
+declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
+declare double @llvm.sqrt.f64(double)
+declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
+declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
-define float @fsqrt(float %a) #0 {
+define float @fsqrt(float %a) {
; FAULT-LABEL: fsqrt:
; FAULT: // %bb.0:
; FAULT-NEXT: fsqrt s0, s0
@@ -33,7 +33,7 @@ define float @fsqrt(float %a) #0 {
ret float %1
}
-define float @fsqrt_ieee_denorms(float %a) #1 {
+define float @fsqrt_ieee_denorms(float %a) #0 {
; FAULT-LABEL: fsqrt_ieee_denorms:
; FAULT: // %bb.0:
; FAULT-NEXT: fsqrt s0, s0
@@ -56,7 +56,7 @@ define float @fsqrt_ieee_denorms(float %a) #1 {
ret float %1
}
-define <2 x float> @f2sqrt(<2 x float> %a) #0 {
+define <2 x float> @f2sqrt(<2 x float> %a) {
; FAULT-LABEL: f2sqrt:
; FAULT: // %bb.0:
; FAULT-NEXT: fsqrt v0.2s, v0.2s
@@ -79,7 +79,7 @@ define <2 x float> @f2sqrt(<2 x float> %a) #0 {
ret <2 x float> %1
}
-define <4 x float> @f4sqrt(<4 x float> %a) #0 {
+define <4 x float> @f4sqrt(<4 x float> %a) {
; FAULT-LABEL: f4sqrt:
; FAULT: // %bb.0:
; FAULT-NEXT: fsqrt v0.4s, v0.4s
@@ -102,7 +102,7 @@ define <4 x float> @f4sqrt(<4 x float> %a) #0 {
ret <4 x float> %1
}
-define <8 x float> @f8sqrt(<8 x float> %a) #0 {
+define <8 x float> @f8sqrt(<8 x float> %a) {
; FAULT-LABEL: f8sqrt:
; FAULT: // %bb.0:
; FAULT-NEXT: fsqrt v0.4s, v0.4s
@@ -136,7 +136,7 @@ define <8 x float> @f8sqrt(<8 x float> %a) #0 {
ret <8 x float> %1
}
-define double @dsqrt(double %a) #0 {
+define double @dsqrt(double %a) {
; FAULT-LABEL: dsqrt:
; FAULT: // %bb.0:
; FAULT-NEXT: fsqrt d0, d0
@@ -162,7 +162,7 @@ define double @dsqrt(double %a) #0 {
ret double %1
}
-define double @dsqrt_ieee_denorms(double %a) #1 {
+define double @dsqrt_ieee_denorms(double %a) #0 {
; FAULT-LABEL: dsqrt_ieee_denorms:
; FAULT: // %bb.0:
; FAULT-NEXT: fsqrt d0, d0
@@ -188,7 +188,7 @@ define double @dsqrt_ieee_denorms(double %a) #1 {
ret double %1
}
-define <2 x double> @d2sqrt(<2 x double> %a) #0 {
+define <2 x double> @d2sqrt(<2 x double> %a) {
; FAULT-LABEL: d2sqrt:
; FAULT: // %bb.0:
; FAULT-NEXT: fsqrt v0.2d, v0.2d
@@ -214,7 +214,7 @@ define <2 x double> @d2sqrt(<2 x double> %a) #0 {
ret <2 x double> %1
}
-define <4 x double> @d4sqrt(<4 x double> %a) #0 {
+define <4 x double> @d4sqrt(<4 x double> %a) {
; FAULT-LABEL: d4sqrt:
; FAULT: // %bb.0:
; FAULT-NEXT: fsqrt v0.2d, v0.2d
@@ -254,7 +254,7 @@ define <4 x double> @d4sqrt(<4 x double> %a) #0 {
ret <4 x double> %1
}
-define float @frsqrt(float %a) #0 {
+define float @frsqrt(float %a) {
; FAULT-LABEL: frsqrt:
; FAULT: // %bb.0:
; FAULT-NEXT: fsqrt s0, s0
@@ -277,7 +277,7 @@ define float @frsqrt(float %a) #0 {
ret float %2
}
-define <2 x float> @f2rsqrt(<2 x float> %a) #0 {
+define <2 x float> @f2rsqrt(<2 x float> %a) {
; FAULT-LABEL: f2rsqrt:
; FAULT: // %bb.0:
; FAULT-NEXT: fsqrt v0.2s, v0.2s
@@ -300,7 +300,7 @@ define <2 x float> @f2rsqrt(<2 x float> %a) #0 {
ret <2 x float> %2
}
-define <4 x float> @f4rsqrt(<4 x float> %a) #0 {
+define <4 x float> @f4rsqrt(<4 x float> %a) {
; FAULT-LABEL: f4rsqrt:
; FAULT: // %bb.0:
; FAULT-NEXT: fsqrt v0.4s, v0.4s
@@ -323,7 +323,7 @@ define <4 x float> @f4rsqrt(<4 x float> %a) #0 {
ret <4 x float> %2
}
-define <8 x float> @f8rsqrt(<8 x float> %a) #0 {
+define <8 x float> @f8rsqrt(<8 x float> %a) {
; FAULT-LABEL: f8rsqrt:
; FAULT: // %bb.0:
; FAULT-NEXT: fsqrt v0.4s, v0.4s
@@ -355,7 +355,7 @@ define <8 x float> @f8rsqrt(<8 x float> %a) #0 {
ret <8 x float> %2
}
-define double @drsqrt(double %a) #0 {
+define double @drsqrt(double %a) {
; FAULT-LABEL: drsqrt:
; FAULT: // %bb.0:
; FAULT-NEXT: fsqrt d0, d0
@@ -381,7 +381,7 @@ define double @drsqrt(double %a) #0 {
ret double %2
}
-define <2 x double> @d2rsqrt(<2 x double> %a) #0 {
+define <2 x double> @d2rsqrt(<2 x double> %a) {
; FAULT-LABEL: d2rsqrt:
; FAULT: // %bb.0:
; FAULT-NEXT: fsqrt v0.2d, v0.2d
@@ -462,8 +462,8 @@ define double @sqrt_fdiv_common_operand(double %x) nounwind {
; CHECK-NEXT: fmul d1, d1, d2
; CHECK-NEXT: fmul d2, d1, d1
; CHECK-NEXT: frsqrts d2, d0, d2
-; CHECK-NEXT: fmul d1, d1, d2
; CHECK-NEXT: fmul d0, d0, d1
+; CHECK-NEXT: fmul d0, d0, d2
; CHECK-NEXT: ret
%sqrt = call fast double @llvm.sqrt.f64(double %x)
%r = fdiv fast double %x, %sqrt
@@ -487,8 +487,8 @@ define <2 x double> @sqrt_fdiv_common_operand_vec(<2 x double> %x) nounwind {
; CHECK-NEXT: fmul v1.2d, v1.2d, v2.2d
; CHECK-NEXT: fmul v2.2d, v1.2d, v1.2d
; CHECK-NEXT: frsqrts v2.2d, v0.2d, v2.2d
-; CHECK-NEXT: fmul v1.2d, v1.2d, v2.2d
; CHECK-NEXT: fmul v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: fmul v0.2d, v0.2d, v2.2d
; CHECK-NEXT: ret
%sqrt = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
%r = fdiv arcp nsz reassoc <2 x double> %x, %sqrt
@@ -513,9 +513,9 @@ define double @sqrt_fdiv_common_operand_extra_use(double %x, ptr %p) nounwind {
; CHECK-NEXT: frsqrts d2, d0, d2
; CHECK-NEXT: fmul d1, d1, d2
; CHECK-NEXT: fmul d2, d1, d1
+; CHECK-NEXT: fmul d1, d0, d1
; CHECK-NEXT: frsqrts d2, d0, d2
; CHECK-NEXT: fmul d1, d1, d2
-; CHECK-NEXT: fmul d1, d0, d1
; CHECK-NEXT: fcsel d2, d0, d1, eq
; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: str d2, [x0]
@@ -671,5 +671,4 @@ define double @sqrt_simplify_before_recip_4_uses(double %x, ptr %p1, ptr %p2, pt
ret double %sqrt_fast
}
-attributes #0 = { "unsafe-fp-math"="true" }
-attributes #1 = { "unsafe-fp-math"="true" "denormal-fp-math"="ieee" }
+attributes #0 = { "denormal-fp-math"="ieee" }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir
index 789385d..b770d43 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir
@@ -1,12 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner %s -o - | FileCheck -check-prefix=GFX9 %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner -fp-contract=fast %s -o - | FileCheck -check-prefix=GFX9-CONTRACT %s
# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner --denormal-fp-math=preserve-sign %s -o - | FileCheck -check-prefix=GFX9-DENORM %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner -enable-unsafe-fp-math %s -o - | FileCheck -check-prefix=GFX9-UNSAFE %s
# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner %s -o - | FileCheck -check-prefix=GFX10 %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -fp-contract=fast %s -o - | FileCheck -check-prefix=GFX10-CONTRACT %s
# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner --denormal-fp-math=preserve-sign %s -o - | FileCheck -check-prefix=GFX10-DENORM %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -enable-unsafe-fp-math %s -o - | FileCheck -check-prefix=GFX10-UNSAFE %s
---
name: test_f32_add_mul
@@ -24,15 +20,7 @@ body: |
; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]]
; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX9-CONTRACT-LABEL: name: test_f32_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX9-DENORM-LABEL: name: test_f32_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-DENORM-NEXT: {{ $}}
@@ -43,15 +31,7 @@ body: |
; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]]
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX9-UNSAFE-LABEL: name: test_f32_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX10-LABEL: name: test_f32_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -62,15 +42,7 @@ body: |
; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]]
; GFX10-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX10-CONTRACT-LABEL: name: test_f32_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX10-DENORM-LABEL: name: test_f32_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-DENORM-NEXT: {{ $}}
@@ -81,15 +53,6 @@ body: |
; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]]
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX10-UNSAFE-LABEL: name: test_f32_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = COPY $vgpr2
@@ -100,6 +63,60 @@ body: |
...
---
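+# The *_contract variants below mirror the preceding tests but set the
+# contract fast-math flag on the G_FMUL/G_FADD pair; as the CHECK lines show,
+# that alone lets the post-legalizer combiner form a G_FMA under every
+# remaining RUN configuration, covering what the removed -fp-contract=fast and
+# -enable-unsafe-fp-math RUN lines used to test.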
+name: test_f32_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_f32_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX9-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX9-DENORM-LABEL: name: test_f32_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-LABEL: name: test_f32_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-DENORM-LABEL: name: test_f32_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = COPY $vgpr2
+ %4:_(s32) = contract G_FMUL %0, %1
+ %5:_(s32) = contract G_FADD %4, %2
+ $vgpr0 = COPY %5(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+
+---
name: test_f32_add_mul_rhs
body: |
bb.1.entry:
@@ -115,15 +132,7 @@ body: |
; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY2]], [[FMUL]]
; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX9-CONTRACT-LABEL: name: test_f32_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX9-DENORM-LABEL: name: test_f32_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-DENORM-NEXT: {{ $}}
@@ -134,15 +143,7 @@ body: |
; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY2]], [[FMUL]]
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX9-UNSAFE-LABEL: name: test_f32_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX10-LABEL: name: test_f32_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -153,15 +154,7 @@ body: |
; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY2]], [[FMUL]]
; GFX10-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX10-CONTRACT-LABEL: name: test_f32_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX10-DENORM-LABEL: name: test_f32_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-DENORM-NEXT: {{ $}}
@@ -172,15 +165,6 @@ body: |
; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY2]], [[FMUL]]
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX10-UNSAFE-LABEL: name: test_f32_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = COPY $vgpr2
@@ -191,6 +175,60 @@ body: |
...
---
+name: test_f32_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_f32_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX9-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX9-DENORM-LABEL: name: test_f32_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-LABEL: name: test_f32_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-DENORM-LABEL: name: test_f32_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = COPY $vgpr2
+ %4:_(s32) = contract G_FMUL %0, %1
+ %5:_(s32) = contract G_FADD %2, %4
+ $vgpr0 = COPY %5(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+
+---
name: test_add_mul_multiple_defs_z
body: |
bb.1.entry:
@@ -209,18 +247,7 @@ body: |
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV1]]
; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32)
- ; GFX9-CONTRACT-LABEL: name: test_add_mul_multiple_defs_z
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
; GFX9-DENORM-LABEL: name: test_add_mul_multiple_defs_z
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX9-DENORM-NEXT: {{ $}}
@@ -234,18 +261,7 @@ body: |
; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV1]]
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
- ; GFX9-UNSAFE-LABEL: name: test_add_mul_multiple_defs_z
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
; GFX10-LABEL: name: test_add_mul_multiple_defs_z
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -259,18 +275,7 @@ body: |
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV1]]
; GFX10-NEXT: $vgpr0 = COPY [[FADD]](s32)
- ; GFX10-CONTRACT-LABEL: name: test_add_mul_multiple_defs_z
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
; GFX10-DENORM-LABEL: name: test_add_mul_multiple_defs_z
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-DENORM-NEXT: {{ $}}
@@ -284,18 +289,6 @@ body: |
; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV1]]
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
- ; GFX10-UNSAFE-LABEL: name: test_add_mul_multiple_defs_z
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%4:_(s32) = COPY $vgpr2
@@ -310,6 +303,76 @@ body: |
...
---
+name: test_add_mul_multiple_defs_z_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; GFX9-LABEL: name: test_add_mul_multiple_defs_z_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
+ ; GFX9-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
+ ; GFX9-DENORM-LABEL: name: test_add_mul_multiple_defs_z_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
+ ; GFX10-LABEL: name: test_add_mul_multiple_defs_z_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
+ ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
+ ; GFX10-DENORM-LABEL: name: test_add_mul_multiple_defs_z_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %4:_(s32) = COPY $vgpr2
+ %5:_(s32) = COPY $vgpr3
+ %2:_(p1) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s32) = contract G_FMUL %0, %1
+ %7:_(<2 x s32>) = G_LOAD %2(p1) :: (load (<2 x s32>), addrspace 1)
+ %12:_(s32), %13:_(s32) = G_UNMERGE_VALUES %7(<2 x s32>)
+ %8:_(s32) = COPY %13(s32)
+ %10:_(s32) = contract G_FADD %6, %8
+ $vgpr0 = COPY %10(s32)
+...
+
+---
name: test_add_mul_rhs_multiple_defs_z
body: |
bb.1.entry:
@@ -328,18 +391,7 @@ body: |
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[UV1]], [[FMUL]]
; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32)
- ; GFX9-CONTRACT-LABEL: name: test_add_mul_rhs_multiple_defs_z
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
; GFX9-DENORM-LABEL: name: test_add_mul_rhs_multiple_defs_z
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX9-DENORM-NEXT: {{ $}}
@@ -353,18 +405,7 @@ body: |
; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[UV1]], [[FMUL]]
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
- ; GFX9-UNSAFE-LABEL: name: test_add_mul_rhs_multiple_defs_z
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
; GFX10-LABEL: name: test_add_mul_rhs_multiple_defs_z
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-NEXT: {{ $}}
@@ -378,18 +419,7 @@ body: |
; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[UV1]], [[FMUL]]
; GFX10-NEXT: $vgpr0 = COPY [[FADD]](s32)
- ; GFX10-CONTRACT-LABEL: name: test_add_mul_rhs_multiple_defs_z
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
; GFX10-DENORM-LABEL: name: test_add_mul_rhs_multiple_defs_z
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX10-DENORM-NEXT: {{ $}}
@@ -403,18 +433,6 @@ body: |
; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[UV1]], [[FMUL]]
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
- ; GFX10-UNSAFE-LABEL: name: test_add_mul_rhs_multiple_defs_z
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%4:_(s32) = COPY $vgpr2
@@ -429,6 +447,76 @@ body: |
...
---
+name: test_add_mul_rhs_multiple_defs_z_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; GFX9-LABEL: name: test_add_mul_rhs_multiple_defs_z_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
+ ; GFX9-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
+ ; GFX9-DENORM-LABEL: name: test_add_mul_rhs_multiple_defs_z_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
+ ; GFX10-LABEL: name: test_add_mul_rhs_multiple_defs_z_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
+ ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ;
+ ; GFX10-DENORM-LABEL: name: test_add_mul_rhs_multiple_defs_z_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %4:_(s32) = COPY $vgpr2
+ %5:_(s32) = COPY $vgpr3
+ %2:_(p1) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s32) = contract G_FMUL %0, %1
+ %7:_(<2 x s32>) = G_LOAD %2(p1) :: (load (<2 x s32>), addrspace 1)
+ %12:_(s32), %13:_(s32) = G_UNMERGE_VALUES %7(<2 x s32>)
+ %8:_(s32) = COPY %13(s32)
+ %10:_(s32) = contract G_FADD %8, %6
+ $vgpr0 = COPY %10(s32)
+...
+
+---
name: test_half_add_mul
body: |
bb.1.entry:
@@ -448,19 +536,7 @@ body: |
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX9-CONTRACT-LABEL: name: test_half_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX9-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX9-DENORM-LABEL: name: test_half_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-DENORM-NEXT: {{ $}}
@@ -475,19 +551,7 @@ body: |
; GFX9-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX9-UNSAFE-LABEL: name: test_half_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX10-LABEL: name: test_half_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -502,19 +566,7 @@ body: |
; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX10-CONTRACT-LABEL: name: test_half_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX10-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX10-DENORM-LABEL: name: test_half_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-DENORM-NEXT: {{ $}}
@@ -529,19 +581,6 @@ body: |
; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX10-UNSAFE-LABEL: name: test_half_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX10-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
%4:_(s32) = COPY $vgpr0
%0:_(s16) = G_TRUNC %4(s32)
%5:_(s32) = COPY $vgpr1
@@ -556,6 +595,80 @@ body: |
...
---
+name: test_half_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_half_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX9-DENORM-LABEL: name: test_half_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX9-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-LABEL: name: test_half_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-DENORM-LABEL: name: test_half_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ %4:_(s32) = COPY $vgpr0
+ %0:_(s16) = G_TRUNC %4(s32)
+ %5:_(s32) = COPY $vgpr1
+ %1:_(s16) = G_TRUNC %5(s32)
+ %6:_(s32) = COPY $vgpr2
+ %2:_(s16) = G_TRUNC %6(s32)
+ %7:_(s16) = contract G_FMUL %0, %1
+ %8:_(s16) = contract G_FADD %7, %2
+ %10:_(s32) = G_ANYEXT %8(s16)
+ $vgpr0 = COPY %10(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+
+---
name: test_half_add_mul_rhs
body: |
bb.1.entry:
@@ -575,19 +688,7 @@ body: |
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX9-CONTRACT-LABEL: name: test_half_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX9-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX9-DENORM-LABEL: name: test_half_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-DENORM-NEXT: {{ $}}
@@ -602,19 +703,7 @@ body: |
; GFX9-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX9-UNSAFE-LABEL: name: test_half_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX10-LABEL: name: test_half_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -629,19 +718,7 @@ body: |
; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX10-CONTRACT-LABEL: name: test_half_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX10-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
; GFX10-DENORM-LABEL: name: test_half_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-DENORM-NEXT: {{ $}}
@@ -656,19 +733,6 @@ body: |
; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ; GFX10-UNSAFE-LABEL: name: test_half_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX10-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
%4:_(s32) = COPY $vgpr0
%0:_(s16) = G_TRUNC %4(s32)
%5:_(s32) = COPY $vgpr1
@@ -683,6 +747,80 @@ body: |
...
---
+name: test_half_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_half_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX9-DENORM-LABEL: name: test_half_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX9-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-LABEL: name: test_half_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-DENORM-LABEL: name: test_half_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ %4:_(s32) = COPY $vgpr0
+ %0:_(s16) = G_TRUNC %4(s32)
+ %5:_(s32) = COPY $vgpr1
+ %1:_(s16) = G_TRUNC %5(s32)
+ %6:_(s32) = COPY $vgpr2
+ %2:_(s16) = G_TRUNC %6(s32)
+ %7:_(s16) = contract G_FMUL %0, %1
+ %8:_(s16) = contract G_FADD %2, %7
+ %10:_(s32) = G_ANYEXT %8(s16)
+ $vgpr0 = COPY %10(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+
+---
name: test_double_add_mul
body: |
bb.1.entry:
@@ -706,23 +844,7 @@ body: |
; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX9-CONTRACT-LABEL: name: test_double_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX9-DENORM-LABEL: name: test_double_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX9-DENORM-NEXT: {{ $}}
@@ -741,23 +863,7 @@ body: |
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX9-UNSAFE-LABEL: name: test_double_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX10-LABEL: name: test_double_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
@@ -776,23 +882,7 @@ body: |
; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX10-CONTRACT-LABEL: name: test_double_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX10-DENORM-LABEL: name: test_double_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-DENORM-NEXT: {{ $}}
@@ -811,23 +901,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX10-UNSAFE-LABEL: name: test_double_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
@@ -846,6 +919,101 @@ body: |
...
---
+name: test_double_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; GFX9-LABEL: name: test_double_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX9-DENORM-LABEL: name: test_double_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-LABEL: name: test_double_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-DENORM-LABEL: name: test_double_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %1:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %2:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+ %10:_(s64) = contract G_FMUL %0, %1
+ %11:_(s64) = contract G_FADD %10, %2
+ %13:_(s32), %14:_(s32) = G_UNMERGE_VALUES %11(s64)
+ $vgpr0 = COPY %13(s32)
+ $vgpr1 = COPY %14(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+---
name: test_double_add_mul_rhs
body: |
bb.1.entry:
@@ -869,23 +1037,7 @@ body: |
; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX9-CONTRACT-LABEL: name: test_double_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX9-DENORM-LABEL: name: test_double_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX9-DENORM-NEXT: {{ $}}
@@ -904,23 +1056,7 @@ body: |
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX9-UNSAFE-LABEL: name: test_double_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX10-LABEL: name: test_double_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
@@ -939,23 +1075,7 @@ body: |
; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX10-CONTRACT-LABEL: name: test_double_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX10-DENORM-LABEL: name: test_double_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-DENORM-NEXT: {{ $}}
@@ -974,23 +1094,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX10-UNSAFE-LABEL: name: test_double_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
@@ -1009,6 +1112,100 @@ body: |
...
---
+name: test_double_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; GFX9-LABEL: name: test_double_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX9-DENORM-LABEL: name: test_double_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-LABEL: name: test_double_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-DENORM-LABEL: name: test_double_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %1:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %2:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+ %10:_(s64) = contract G_FMUL %0, %1
+ %11:_(s64) = contract G_FADD %2, %10
+ %13:_(s32), %14:_(s32) = G_UNMERGE_VALUES %11(s64)
+ $vgpr0 = COPY %13(s32)
+ $vgpr1 = COPY %14(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+---
name: test_4xfloat_add_mul
body: |
bb.1.entry:
@@ -1040,32 +1237,7 @@ body: |
; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ; GFX9-CONTRACT-LABEL: name: test_4xfloat_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX9-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX9-DENORM-LABEL: name: test_4xfloat_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
; GFX9-DENORM-NEXT: {{ $}}
@@ -1092,32 +1264,7 @@ body: |
; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ; GFX9-UNSAFE-LABEL: name: test_4xfloat_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX9-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX10-LABEL: name: test_4xfloat_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
; GFX10-NEXT: {{ $}}
@@ -1144,32 +1291,7 @@ body: |
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ; GFX10-CONTRACT-LABEL: name: test_4xfloat_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX10-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
; GFX10-DENORM-LABEL: name: test_4xfloat_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
; GFX10-DENORM-NEXT: {{ $}}
@@ -1196,32 +1318,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ; GFX10-UNSAFE-LABEL: name: test_4xfloat_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX10-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%6:_(s32) = COPY $vgpr2
@@ -1248,6 +1344,144 @@ body: |
...
---
+name: test_4xfloat_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+
+ ; GFX9-LABEL: name: test_4xfloat_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; GFX9-DENORM-LABEL: name: test_4xfloat_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; GFX10-LABEL: name: test_4xfloat_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; GFX10-DENORM-LABEL: name: test_4xfloat_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %0:_(<4 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32), %6(s32), %7(s32)
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %10:_(s32) = COPY $vgpr6
+ %11:_(s32) = COPY $vgpr7
+ %1:_(<4 x s32>) = G_BUILD_VECTOR %8(s32), %9(s32), %10(s32), %11(s32)
+ %12:_(s32) = COPY $vgpr8
+ %13:_(s32) = COPY $vgpr9
+ %14:_(s32) = COPY $vgpr10
+ %15:_(s32) = COPY $vgpr11
+ %2:_(<4 x s32>) = G_BUILD_VECTOR %12(s32), %13(s32), %14(s32), %15(s32)
+ %16:_(<4 x s32>) = contract G_FMUL %0, %1
+ %17:_(<4 x s32>) = contract G_FADD %16, %2
+ %19:_(s32), %20:_(s32), %21:_(s32), %22:_(s32) = G_UNMERGE_VALUES %17(<4 x s32>)
+ $vgpr0 = COPY %19(s32)
+ $vgpr1 = COPY %20(s32)
+ $vgpr2 = COPY %21(s32)
+ $vgpr3 = COPY %22(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+...
+
+---
name: test_3xfloat_add_mul_rhs
body: |
bb.1.entry:
@@ -1275,28 +1509,7 @@ body: |
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
- ; GFX9-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX9-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
; GFX9-DENORM-LABEL: name: test_3xfloat_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX9-DENORM-NEXT: {{ $}}
@@ -1319,28 +1532,7 @@ body: |
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
- ; GFX9-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX9-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
; GFX10-LABEL: name: test_3xfloat_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-NEXT: {{ $}}
@@ -1363,28 +1555,7 @@ body: |
; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
- ; GFX10-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX10-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
; GFX10-DENORM-LABEL: name: test_3xfloat_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-DENORM-NEXT: {{ $}}
@@ -1407,28 +1578,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
- ; GFX10-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX10-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%6:_(s32) = COPY $vgpr2
@@ -1451,6 +1600,124 @@ body: |
...
---
+name: test_3xfloat_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+
+ ; GFX9-LABEL: name: test_3xfloat_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+ ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
+ ; GFX9-DENORM-LABEL: name: test_3xfloat_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+ ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
+ ; GFX10-LABEL: name: test_3xfloat_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+ ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
+ ; GFX10-DENORM-LABEL: name: test_3xfloat_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+ ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %6:_(s32) = COPY $vgpr2
+ %0:_(<3 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32), %6(s32)
+ %7:_(s32) = COPY $vgpr3
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %1:_(<3 x s32>) = G_BUILD_VECTOR %7(s32), %8(s32), %9(s32)
+ %10:_(s32) = COPY $vgpr6
+ %11:_(s32) = COPY $vgpr7
+ %12:_(s32) = COPY $vgpr8
+ %2:_(<3 x s32>) = G_BUILD_VECTOR %10(s32), %11(s32), %12(s32)
+ %13:_(<3 x s32>) = contract G_FMUL %0, %1
+ %14:_(<3 x s32>) = contract G_FADD %2, %13
+ %16:_(s32), %17:_(s32), %18:_(s32) = G_UNMERGE_VALUES %14(<3 x s32>)
+ $vgpr0 = COPY %16(s32)
+ $vgpr1 = COPY %17(s32)
+ $vgpr2 = COPY %18(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+...
+
+---
name: test_4xhalf_add_mul
body: |
bb.1.entry:
@@ -1474,24 +1741,7 @@ body: |
; GFX9-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX9-CONTRACT-LABEL: name: test_4xhalf_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
- ; GFX9-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX9-DENORM-LABEL: name: test_4xhalf_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX9-DENORM-NEXT: {{ $}}
@@ -1510,24 +1760,7 @@ body: |
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX9-UNSAFE-LABEL: name: test_4xhalf_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
- ; GFX9-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX10-LABEL: name: test_4xhalf_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
@@ -1546,24 +1779,7 @@ body: |
; GFX10-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
; GFX10-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX10-CONTRACT-LABEL: name: test_4xhalf_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
- ; GFX10-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
; GFX10-DENORM-LABEL: name: test_4xhalf_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-DENORM-NEXT: {{ $}}
@@ -1582,24 +1798,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ; GFX10-UNSAFE-LABEL: name: test_4xhalf_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
- ; GFX10-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
%4:_(<2 x s16>) = COPY $vgpr0
%5:_(<2 x s16>) = COPY $vgpr1
%0:_(<4 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>)
@@ -1618,6 +1816,105 @@ body: |
...
---
+name: test_4xhalf_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; GFX9-LABEL: name: test_4xhalf_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX9-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+ ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = contract G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = contract G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX9-DENORM-LABEL: name: test_4xhalf_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = contract G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
+ ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = contract G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-LABEL: name: test_4xhalf_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX10-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+ ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = contract G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = contract G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-DENORM-LABEL: name: test_4xhalf_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = contract G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]]
+ ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = contract G_FADD [[FMUL]], [[CONCAT_VECTORS2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ %4:_(<2 x s16>) = COPY $vgpr0
+ %5:_(<2 x s16>) = COPY $vgpr1
+ %0:_(<4 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>)
+ %6:_(<2 x s16>) = COPY $vgpr2
+ %7:_(<2 x s16>) = COPY $vgpr3
+ %1:_(<4 x s16>) = G_CONCAT_VECTORS %6(<2 x s16>), %7(<2 x s16>)
+ %8:_(<2 x s16>) = COPY $vgpr4
+ %9:_(<2 x s16>) = COPY $vgpr5
+ %2:_(<4 x s16>) = G_CONCAT_VECTORS %8(<2 x s16>), %9(<2 x s16>)
+ %10:_(<4 x s16>) = contract G_FMUL %0, %1
+ %11:_(<4 x s16>) = contract G_FADD %10, %2
+ %13:_(<2 x s16>), %14:_(<2 x s16>) = G_UNMERGE_VALUES %11(<4 x s16>)
+ $vgpr0 = COPY %13(<2 x s16>)
+ $vgpr1 = COPY %14(<2 x s16>)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+---
name: test_3xhalf_add_mul_rhs
body: |
bb.1.entry:
@@ -1648,31 +1945,6 @@ body: |
; GFX9-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-CONTRACT-LABEL: name: test_3xhalf_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
- ; GFX9-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = G_FMUL [[UV]], [[UV2]]
- ; GFX9-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = G_FADD [[UV4]], [[FMUL]]
- ; GFX9-CONTRACT-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>)
- ; GFX9-CONTRACT-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX9-DENORM-LABEL: name: test_3xhalf_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX9-DENORM-NEXT: {{ $}}
@@ -1698,31 +1970,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-UNSAFE-LABEL: name: test_3xhalf_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
- ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = G_FMUL [[UV]], [[UV2]]
- ; GFX9-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = G_FADD [[UV4]], [[FMUL]]
- ; GFX9-UNSAFE-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>)
- ; GFX9-UNSAFE-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-LABEL: name: test_3xhalf_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
@@ -1748,31 +1995,6 @@ body: |
; GFX10-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX10-CONTRACT-LABEL: name: test_3xhalf_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
- ; GFX10-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = G_FMUL [[UV]], [[UV2]]
- ; GFX10-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = G_FADD [[UV4]], [[FMUL]]
- ; GFX10-CONTRACT-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>)
- ; GFX10-CONTRACT-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-DENORM-LABEL: name: test_3xhalf_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-DENORM-NEXT: {{ $}}
@@ -1797,31 +2019,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
- ; GFX10-UNSAFE-LABEL: name: test_3xhalf_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
- ; GFX10-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = G_FMUL [[UV]], [[UV2]]
- ; GFX10-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = G_FADD [[UV4]], [[FMUL]]
- ; GFX10-UNSAFE-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>)
- ; GFX10-UNSAFE-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
%4:_(<2 x s16>) = COPY $vgpr0
%5:_(<2 x s16>) = COPY $vgpr1
%10:_(<2 x s16>) = G_IMPLICIT_DEF
@@ -1846,6 +2043,134 @@ body: |
...
---
+name: test_3xhalf_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; GFX9-LABEL: name: test_3xhalf_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX9-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
+ ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = contract G_FMUL [[UV]], [[UV2]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = contract G_FADD [[UV4]], [[FMUL]]
+ ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>)
+ ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX9-DENORM-LABEL: name: test_3xhalf_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
+ ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = contract G_FMUL [[UV]], [[UV2]]
+ ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = contract G_FADD [[UV4]], [[FMUL]]
+ ; GFX9-DENORM-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>)
+ ; GFX9-DENORM-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-LABEL: name: test_3xhalf_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX10-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
+ ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = contract G_FMUL [[UV]], [[UV2]]
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = contract G_FADD [[UV4]], [[FMUL]]
+ ; GFX10-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>)
+ ; GFX10-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-DENORM-LABEL: name: test_3xhalf_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
+ ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = contract G_FMUL [[UV]], [[UV2]]
+ ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = contract G_FADD [[UV4]], [[FMUL]]
+ ; GFX10-DENORM-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>)
+ ; GFX10-DENORM-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ %4:_(<2 x s16>) = COPY $vgpr0
+ %5:_(<2 x s16>) = COPY $vgpr1
+ %10:_(<2 x s16>) = G_IMPLICIT_DEF
+ %11:_(<6 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>), %10(<2 x s16>)
+ %0:_(<3 x s16>), %12:_(<3 x s16>) = G_UNMERGE_VALUES %11(<6 x s16>)
+ %6:_(<2 x s16>) = COPY $vgpr2
+ %7:_(<2 x s16>) = COPY $vgpr3
+ %13:_(<6 x s16>) = G_CONCAT_VECTORS %6(<2 x s16>), %7(<2 x s16>), %10(<2 x s16>)
+ %1:_(<3 x s16>), %14:_(<3 x s16>) = G_UNMERGE_VALUES %13(<6 x s16>)
+ %8:_(<2 x s16>) = COPY $vgpr4
+ %9:_(<2 x s16>) = COPY $vgpr5
+ %15:_(<6 x s16>) = G_CONCAT_VECTORS %8(<2 x s16>), %9(<2 x s16>), %10(<2 x s16>)
+ %2:_(<3 x s16>), %16:_(<3 x s16>) = G_UNMERGE_VALUES %15(<6 x s16>)
+ %17:_(<3 x s16>) = contract G_FMUL %0, %1
+ %18:_(<3 x s16>) = contract G_FADD %2, %17
+ %22:_(<3 x s16>) = G_IMPLICIT_DEF
+ %23:_(<6 x s16>) = G_CONCAT_VECTORS %18(<3 x s16>), %22(<3 x s16>)
+ %20:_(<2 x s16>), %21:_(<2 x s16>), %24:_(<2 x s16>) = G_UNMERGE_VALUES %23(<6 x s16>)
+ $vgpr0 = COPY %20(<2 x s16>)
+ $vgpr1 = COPY %21(<2 x s16>)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+---
name: test_4xdouble_add_mul
body: |
bb.1.entry:
@@ -1905,60 +2230,7 @@ body: |
; GFX9-NEXT: $vgpr6 = COPY [[UV6]](s32)
; GFX9-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
- ; GFX9-CONTRACT-LABEL: name: test_4xdouble_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
- ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX9-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX9-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX9-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX9-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
- ; GFX9-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX9-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX9-CONTRACT-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
- ; GFX9-CONTRACT-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
- ; GFX9-CONTRACT-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
- ; GFX9-CONTRACT-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
- ; GFX9-CONTRACT-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
- ; GFX9-CONTRACT-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
- ; GFX9-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
- ; GFX9-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX9-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr6 = COPY [[UV6]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr7 = COPY [[UV7]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
; GFX9-DENORM-LABEL: name: test_4xdouble_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX9-DENORM-NEXT: {{ $}}
@@ -2013,60 +2285,7 @@ body: |
; GFX9-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32)
; GFX9-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
- ; GFX9-UNSAFE-LABEL: name: test_4xdouble_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
- ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX9-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX9-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX9-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX9-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
- ; GFX9-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX9-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX9-UNSAFE-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
- ; GFX9-UNSAFE-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
- ; GFX9-UNSAFE-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
- ; GFX9-UNSAFE-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
- ; GFX9-UNSAFE-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
- ; GFX9-UNSAFE-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
- ; GFX9-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
- ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX9-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr6 = COPY [[UV6]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr7 = COPY [[UV7]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
; GFX10-LABEL: name: test_4xdouble_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX10-NEXT: {{ $}}
@@ -2121,60 +2340,7 @@ body: |
; GFX10-NEXT: $vgpr6 = COPY [[UV6]](s32)
; GFX10-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
- ; GFX10-CONTRACT-LABEL: name: test_4xdouble_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
- ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX10-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX10-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX10-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX10-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
- ; GFX10-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX10-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX10-CONTRACT-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
- ; GFX10-CONTRACT-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
- ; GFX10-CONTRACT-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
- ; GFX10-CONTRACT-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
- ; GFX10-CONTRACT-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
- ; GFX10-CONTRACT-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
- ; GFX10-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
- ; GFX10-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX10-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr6 = COPY [[UV6]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr7 = COPY [[UV7]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
; GFX10-DENORM-LABEL: name: test_4xdouble_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX10-DENORM-NEXT: {{ $}}
@@ -2229,60 +2395,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32)
; GFX10-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
- ; GFX10-UNSAFE-LABEL: name: test_4xdouble_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
- ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX10-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX10-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX10-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX10-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
- ; GFX10-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX10-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX10-UNSAFE-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
- ; GFX10-UNSAFE-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
- ; GFX10-UNSAFE-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
- ; GFX10-UNSAFE-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
- ; GFX10-UNSAFE-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
- ; GFX10-UNSAFE-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
- ; GFX10-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
- ; GFX10-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX10-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr6 = COPY [[UV6]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr7 = COPY [[UV7]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%6:_(s32) = COPY $vgpr2
@@ -2337,6 +2449,284 @@ body: |
...
---
+name: test_4xdouble_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+
+ ; GFX9-LABEL: name: test_4xdouble_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
+ ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
+ ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
+ ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
+ ; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
+ ; GFX9-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
+ ; GFX9-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX9-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX9-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX9-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+ ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX9-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX9-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX9-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX9-DENORM-LABEL: name: test_4xdouble_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+ ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+ ; GFX9-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX9-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX9-DENORM-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
+ ; GFX9-DENORM-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
+ ; GFX9-DENORM-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
+ ; GFX9-DENORM-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
+ ; GFX9-DENORM-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
+ ; GFX9-DENORM-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
+ ; GFX9-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX9-DENORM-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX9-DENORM-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX9-DENORM-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+ ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX10-LABEL: name: test_4xdouble_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX10-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
+ ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
+ ; GFX10-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX10-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX10-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+ ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX10-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX10-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX10-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX10-DENORM-LABEL: name: test_4xdouble_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+ ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX10-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX10-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX10-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX10-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+ ; GFX10-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX10-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX10-DENORM-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
+ ; GFX10-DENORM-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
+ ; GFX10-DENORM-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
+ ; GFX10-DENORM-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
+ ; GFX10-DENORM-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
+ ; GFX10-DENORM-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
+ ; GFX10-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-DENORM-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX10-DENORM-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX10-DENORM-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+ ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %10:_(s32) = COPY $vgpr6
+ %11:_(s32) = COPY $vgpr7
+ %28:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %29:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+ %30:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+ %31:_(s64) = G_MERGE_VALUES %10(s32), %11(s32)
+ %0:_(<4 x s64>) = G_BUILD_VECTOR %28(s64), %29(s64), %30(s64), %31(s64)
+ %12:_(s32) = COPY $vgpr8
+ %13:_(s32) = COPY $vgpr9
+ %14:_(s32) = COPY $vgpr10
+ %15:_(s32) = COPY $vgpr11
+ %16:_(s32) = COPY $vgpr12
+ %17:_(s32) = COPY $vgpr13
+ %18:_(s32) = COPY $vgpr14
+ %19:_(s32) = COPY $vgpr15
+ %32:_(s64) = G_MERGE_VALUES %12(s32), %13(s32)
+ %33:_(s64) = G_MERGE_VALUES %14(s32), %15(s32)
+ %34:_(s64) = G_MERGE_VALUES %16(s32), %17(s32)
+ %35:_(s64) = G_MERGE_VALUES %18(s32), %19(s32)
+ %1:_(<4 x s64>) = G_BUILD_VECTOR %32(s64), %33(s64), %34(s64), %35(s64)
+ %20:_(s32) = COPY $vgpr16
+ %21:_(s32) = COPY $vgpr17
+ %22:_(s32) = COPY $vgpr18
+ %23:_(s32) = COPY $vgpr19
+ %24:_(s32) = COPY $vgpr20
+ %25:_(s32) = COPY $vgpr21
+ %26:_(s32) = COPY $vgpr22
+ %27:_(s32) = COPY $vgpr23
+ %36:_(s64) = G_MERGE_VALUES %20(s32), %21(s32)
+ %37:_(s64) = G_MERGE_VALUES %22(s32), %23(s32)
+ %38:_(s64) = G_MERGE_VALUES %24(s32), %25(s32)
+ %39:_(s64) = G_MERGE_VALUES %26(s32), %27(s32)
+ %2:_(<4 x s64>) = G_BUILD_VECTOR %36(s64), %37(s64), %38(s64), %39(s64)
+ %40:_(<4 x s64>) = contract G_FMUL %0, %1
+ %41:_(<4 x s64>) = contract G_FADD %40, %2
+ %43:_(s32), %44:_(s32), %45:_(s32), %46:_(s32), %47:_(s32), %48:_(s32), %49:_(s32), %50:_(s32) = G_UNMERGE_VALUES %41(<4 x s64>)
+ $vgpr0 = COPY %43(s32)
+ $vgpr1 = COPY %44(s32)
+ $vgpr2 = COPY %45(s32)
+ $vgpr3 = COPY %46(s32)
+ $vgpr4 = COPY %47(s32)
+ $vgpr5 = COPY %48(s32)
+ $vgpr6 = COPY %49(s32)
+ $vgpr7 = COPY %50(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+...
+
+---
name: test_3xdouble_add_mul_rhs
body: |
bb.1.entry:
@@ -2385,49 +2775,7 @@ body: |
; GFX9-NEXT: $vgpr4 = COPY [[UV4]](s32)
; GFX9-NEXT: $vgpr5 = COPY [[UV5]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
- ; GFX9-CONTRACT-LABEL: name: test_3xdouble_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
- ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
- ; GFX9-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX9-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX9-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX9-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX9-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX9-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX9-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
- ; GFX9-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX9-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
; GFX9-DENORM-LABEL: name: test_3xdouble_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
; GFX9-DENORM-NEXT: {{ $}}
@@ -2471,49 +2819,7 @@ body: |
; GFX9-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
; GFX9-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
- ; GFX9-UNSAFE-LABEL: name: test_3xdouble_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
- ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
- ; GFX9-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX9-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX9-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX9-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX9-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX9-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX9-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
- ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX9-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
; GFX10-LABEL: name: test_3xdouble_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
; GFX10-NEXT: {{ $}}
@@ -2557,49 +2863,7 @@ body: |
; GFX10-NEXT: $vgpr4 = COPY [[UV4]](s32)
; GFX10-NEXT: $vgpr5 = COPY [[UV5]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
- ; GFX10-CONTRACT-LABEL: name: test_3xdouble_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
- ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
- ; GFX10-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX10-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX10-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX10-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX10-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX10-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX10-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
- ; GFX10-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX10-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
; GFX10-DENORM-LABEL: name: test_3xdouble_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
; GFX10-DENORM-NEXT: {{ $}}
@@ -2643,49 +2907,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
; GFX10-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
- ; GFX10-UNSAFE-LABEL: name: test_3xdouble_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
- ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
- ; GFX10-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX10-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX10-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX10-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX10-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX10-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX10-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
- ; GFX10-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
- ; GFX10-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%6:_(s32) = COPY $vgpr2
@@ -2727,3 +2948,226 @@ body: |
$vgpr5 = COPY %39(s32)
S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
...
+
+---
+name: test_3xdouble_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+
+ ; GFX9-LABEL: name: test_3xdouble_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX9-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX9-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+ ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX9-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
+ ; GFX9-DENORM-LABEL: name: test_3xdouble_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+ ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
+ ; GFX9-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX9-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX9-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX9-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+ ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
+ ; GFX10-LABEL: name: test_3xdouble_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX10-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+ ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX10-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
+ ; GFX10-DENORM-LABEL: name: test_3xdouble_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+ ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
+ ; GFX10-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX10-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX10-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX10-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX10-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX10-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX10-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+ ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %22:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %23:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+ %24:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+ %0:_(<3 x s64>) = G_BUILD_VECTOR %22(s64), %23(s64), %24(s64)
+ %10:_(s32) = COPY $vgpr6
+ %11:_(s32) = COPY $vgpr7
+ %12:_(s32) = COPY $vgpr8
+ %13:_(s32) = COPY $vgpr9
+ %14:_(s32) = COPY $vgpr10
+ %15:_(s32) = COPY $vgpr11
+ %25:_(s64) = G_MERGE_VALUES %10(s32), %11(s32)
+ %26:_(s64) = G_MERGE_VALUES %12(s32), %13(s32)
+ %27:_(s64) = G_MERGE_VALUES %14(s32), %15(s32)
+ %1:_(<3 x s64>) = G_BUILD_VECTOR %25(s64), %26(s64), %27(s64)
+ %16:_(s32) = COPY $vgpr12
+ %17:_(s32) = COPY $vgpr13
+ %18:_(s32) = COPY $vgpr14
+ %19:_(s32) = COPY $vgpr15
+ %20:_(s32) = COPY $vgpr16
+ %21:_(s32) = COPY $vgpr17
+ %28:_(s64) = G_MERGE_VALUES %16(s32), %17(s32)
+ %29:_(s64) = G_MERGE_VALUES %18(s32), %19(s32)
+ %30:_(s64) = G_MERGE_VALUES %20(s32), %21(s32)
+ %2:_(<3 x s64>) = G_BUILD_VECTOR %28(s64), %29(s64), %30(s64)
+ %31:_(<3 x s64>) = contract G_FMUL %0, %1
+ %32:_(<3 x s64>) = contract G_FADD %2, %31
+ %34:_(s32), %35:_(s32), %36:_(s32), %37:_(s32), %38:_(s32), %39:_(s32) = G_UNMERGE_VALUES %32(<3 x s64>)
+ $vgpr0 = COPY %34(s32)
+ $vgpr1 = COPY %35(s32)
+ $vgpr2 = COPY %36(s32)
+ $vgpr3 = COPY %37(s32)
+ $vgpr4 = COPY %38(s32)
+ $vgpr5 = COPY %39(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir
index 42e53be..8f9fc67 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir
@@ -1,12 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck -check-prefix=GFX9 %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner -fp-contract=fast %s -o - | FileCheck -check-prefix=GFX9-CONTRACT %s
# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner --denormal-fp-math=preserve-sign %s -o - | FileCheck -check-prefix=GFX9-DENORM %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner -enable-unsafe-fp-math %s -o - | FileCheck -check-prefix=GFX9-UNSAFE %s
# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck -check-prefix=GFX10 %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-prelegalizer-combiner -fp-contract=fast %s -o - | FileCheck -check-prefix=GFX10-CONTRACT %s
# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-prelegalizer-combiner --denormal-fp-math=preserve-sign %s -o - | FileCheck -check-prefix=GFX10-DENORM %s
-# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-prelegalizer-combiner -enable-unsafe-fp-math %s -o - | FileCheck -check-prefix=GFX10-UNSAFE %s
---
name: test_f32_add_mul
@@ -25,16 +21,6 @@ body: |
; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX9-CONTRACT-LABEL: name: test_f32_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX9-DENORM-LABEL: name: test_f32_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-DENORM-NEXT: {{ $}}
@@ -46,16 +32,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX9-UNSAFE-LABEL: name: test_f32_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX10-LABEL: name: test_f32_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -67,16 +43,6 @@ body: |
; GFX10-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX10-CONTRACT-LABEL: name: test_f32_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX10-DENORM-LABEL: name: test_f32_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-DENORM-NEXT: {{ $}}
@@ -87,16 +53,6 @@ body: |
; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = reassoc G_FADD [[FMUL]], [[COPY2]]
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
- ; GFX10-UNSAFE-LABEL: name: test_f32_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = COPY $vgpr2
@@ -107,6 +63,60 @@ body: |
...
---
+name: test_f32_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_f32_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX9-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX9-DENORM-LABEL: name: test_f32_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-LABEL: name: test_f32_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-DENORM-LABEL: name: test_f32_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = COPY $vgpr2
+ %4:_(s32) = reassoc contract G_FMUL %0, %1
+ %5:_(s32) = reassoc contract G_FADD %4, %2
+ $vgpr0 = COPY %5(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+
+---
name: test_f32_add_mul_rhs
body: |
bb.1.entry:
@@ -123,16 +133,6 @@ body: |
; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX9-CONTRACT-LABEL: name: test_f32_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX9-DENORM-LABEL: name: test_f32_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-DENORM-NEXT: {{ $}}
@@ -144,16 +144,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX9-UNSAFE-LABEL: name: test_f32_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX10-LABEL: name: test_f32_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -165,16 +155,6 @@ body: |
; GFX10-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX10-CONTRACT-LABEL: name: test_f32_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX10-DENORM-LABEL: name: test_f32_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-DENORM-NEXT: {{ $}}
@@ -185,16 +165,6 @@ body: |
; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = reassoc G_FADD [[COPY2]], [[FMUL]]
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
- ; GFX10-UNSAFE-LABEL: name: test_f32_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = COPY $vgpr2
@@ -205,6 +175,60 @@ body: |
...
---
+name: test_f32_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_f32_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX9-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX9-DENORM-LABEL: name: test_f32_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-LABEL: name: test_f32_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-DENORM-LABEL: name: test_f32_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(s32) = COPY $vgpr2
+ %4:_(s32) = reassoc contract G_FMUL %0, %1
+ %5:_(s32) = reassoc contract G_FADD %2, %4
+ $vgpr0 = COPY %5(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+
+---
name: test_half_add_mul
body: |
bb.1.entry:
@@ -225,20 +249,6 @@ body: |
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX9-CONTRACT-LABEL: name: test_half_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX9-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX9-DENORM-LABEL: name: test_half_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-DENORM-NEXT: {{ $}}
@@ -254,20 +264,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX9-UNSAFE-LABEL: name: test_half_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX10-LABEL: name: test_half_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -283,20 +279,6 @@ body: |
; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX10-CONTRACT-LABEL: name: test_half_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX10-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX10-DENORM-LABEL: name: test_half_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-DENORM-NEXT: {{ $}}
@@ -311,20 +293,6 @@ body: |
; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
- ; GFX10-UNSAFE-LABEL: name: test_half_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX10-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
%4:_(s32) = COPY $vgpr0
%0:_(s16) = G_TRUNC %4(s32)
%5:_(s32) = COPY $vgpr1
@@ -339,6 +307,81 @@ body: |
...
---
+name: test_half_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_half_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX9-DENORM-LABEL: name: test_half_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX9-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-LABEL: name: test_half_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-DENORM-LABEL: name: test_half_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
+ ; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ %4:_(s32) = COPY $vgpr0
+ %0:_(s16) = G_TRUNC %4(s32)
+ %5:_(s32) = COPY $vgpr1
+ %1:_(s16) = G_TRUNC %5(s32)
+ %6:_(s32) = COPY $vgpr2
+ %2:_(s16) = G_TRUNC %6(s32)
+ %7:_(s16) = reassoc contract G_FMUL %0, %1
+ %8:_(s16) = reassoc contract G_FADD %7, %2
+ %10:_(s32) = G_ANYEXT %8(s16)
+ $vgpr0 = COPY %10(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+
+---
name: test_half_add_mul_rhs
body: |
bb.1.entry:
@@ -359,20 +402,6 @@ body: |
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX9-CONTRACT-LABEL: name: test_half_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX9-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX9-DENORM-LABEL: name: test_half_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-DENORM-NEXT: {{ $}}
@@ -388,20 +417,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX9-UNSAFE-LABEL: name: test_half_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX10-LABEL: name: test_half_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -417,20 +432,6 @@ body: |
; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX10-CONTRACT-LABEL: name: test_half_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX10-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
- ;
; GFX10-DENORM-LABEL: name: test_half_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX10-DENORM-NEXT: {{ $}}
@@ -445,20 +446,84 @@ body: |
; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ %4:_(s32) = COPY $vgpr0
+ %0:_(s16) = G_TRUNC %4(s32)
+ %5:_(s32) = COPY $vgpr1
+ %1:_(s16) = G_TRUNC %5(s32)
+ %6:_(s32) = COPY $vgpr2
+ %2:_(s16) = G_TRUNC %6(s32)
+ %7:_(s16) = reassoc G_FMUL %0, %1
+ %8:_(s16) = reassoc G_FADD %2, %7
+ %10:_(s32) = G_ANYEXT %8(s16)
+ $vgpr0 = COPY %10(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+
+---
+name: test_half_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: test_half_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s16) = reassoc G_FMUL [[TRUNC]], [[TRUNC1]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s16) = reassoc G_FADD [[TRUNC2]], [[FMUL]]
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX9-DENORM-LABEL: name: test_half_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(s16) = reassoc G_FMUL [[TRUNC]], [[TRUNC1]]
+ ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s16) = reassoc G_FADD [[TRUNC2]], [[FMUL]]
+ ; GFX9-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
;
- ; GFX10-UNSAFE-LABEL: name: test_half_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]]
- ; GFX10-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ; GFX10-LABEL: name: test_half_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s16) = reassoc G_FMUL [[TRUNC]], [[TRUNC1]]
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s16) = reassoc G_FADD [[TRUNC2]], [[FMUL]]
+ ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
+ ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+ ;
+ ; GFX10-DENORM-LABEL: name: test_half_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(s16) = reassoc G_FMUL [[TRUNC]], [[TRUNC1]]
+ ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s16) = reassoc G_FADD [[TRUNC2]], [[FMUL]]
+ ; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
%4:_(s32) = COPY $vgpr0
%0:_(s16) = G_TRUNC %4(s32)
%5:_(s32) = COPY $vgpr1
@@ -497,24 +562,6 @@ body: |
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-CONTRACT-LABEL: name: test_double_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX9-DENORM-LABEL: name: test_double_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX9-DENORM-NEXT: {{ $}}
@@ -534,24 +581,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-UNSAFE-LABEL: name: test_double_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-LABEL: name: test_double_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
@@ -571,24 +600,6 @@ body: |
; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX10-CONTRACT-LABEL: name: test_double_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-DENORM-LABEL: name: test_double_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-DENORM-NEXT: {{ $}}
@@ -607,24 +618,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
- ; GFX10-UNSAFE-LABEL: name: test_double_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
@@ -643,6 +636,100 @@ body: |
...
---
+name: test_double_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; GFX9-LABEL: name: test_double_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX9-DENORM-LABEL: name: test_double_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-LABEL: name: test_double_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-DENORM-LABEL: name: test_double_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %1:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %2:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+ %10:_(s64) = reassoc contract G_FMUL %0, %1
+ %11:_(s64) = reassoc contract G_FADD %10, %2
+ %13:_(s32), %14:_(s32) = G_UNMERGE_VALUES %11(s64)
+ $vgpr0 = COPY %13(s32)
+ $vgpr1 = COPY %14(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+---
name: test_double_add_mul_rhs
body: |
bb.1.entry:
@@ -667,24 +754,6 @@ body: |
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-CONTRACT-LABEL: name: test_double_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX9-DENORM-LABEL: name: test_double_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX9-DENORM-NEXT: {{ $}}
@@ -704,24 +773,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-UNSAFE-LABEL: name: test_double_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-LABEL: name: test_double_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
@@ -741,24 +792,6 @@ body: |
; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX10-CONTRACT-LABEL: name: test_double_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-DENORM-LABEL: name: test_double_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-DENORM-NEXT: {{ $}}
@@ -777,24 +810,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
- ; GFX10-UNSAFE-LABEL: name: test_double_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
@@ -813,6 +828,100 @@ body: |
...
---
+name: test_double_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; GFX9-LABEL: name: test_double_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX9-DENORM-LABEL: name: test_double_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-LABEL: name: test_double_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-DENORM-LABEL: name: test_double_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %1:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %2:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+ %10:_(s64) = reassoc contract G_FMUL %0, %1
+ %11:_(s64) = reassoc contract G_FADD %2, %10
+ %13:_(s32), %14:_(s32) = G_UNMERGE_VALUES %11(s64)
+ $vgpr0 = COPY %13(s32)
+ $vgpr1 = COPY %14(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+---
name: test_4xfloat_add_mul
body: |
bb.1.entry:
@@ -845,32 +954,6 @@ body: |
; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
- ; GFX9-CONTRACT-LABEL: name: test_4xfloat_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ;
; GFX9-DENORM-LABEL: name: test_4xfloat_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
; GFX9-DENORM-NEXT: {{ $}}
@@ -898,32 +981,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
- ; GFX9-UNSAFE-LABEL: name: test_4xfloat_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ;
; GFX10-LABEL: name: test_4xfloat_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
; GFX10-NEXT: {{ $}}
@@ -951,32 +1008,6 @@ body: |
; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
- ; GFX10-CONTRACT-LABEL: name: test_4xfloat_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ;
; GFX10-DENORM-LABEL: name: test_4xfloat_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
; GFX10-DENORM-NEXT: {{ $}}
@@ -1003,32 +1034,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
- ;
- ; GFX10-UNSAFE-LABEL: name: test_4xfloat_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%6:_(s32) = COPY $vgpr2
@@ -1055,6 +1060,140 @@ body: |
...
---
+name: test_4xfloat_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+
+ ; GFX9-LABEL: name: test_4xfloat_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; GFX9-DENORM-LABEL: name: test_4xfloat_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; GFX10-LABEL: name: test_4xfloat_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; GFX10-DENORM-LABEL: name: test_4xfloat_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %0:_(<4 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32), %6(s32), %7(s32)
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %10:_(s32) = COPY $vgpr6
+ %11:_(s32) = COPY $vgpr7
+ %1:_(<4 x s32>) = G_BUILD_VECTOR %8(s32), %9(s32), %10(s32), %11(s32)
+ %12:_(s32) = COPY $vgpr8
+ %13:_(s32) = COPY $vgpr9
+ %14:_(s32) = COPY $vgpr10
+ %15:_(s32) = COPY $vgpr11
+ %2:_(<4 x s32>) = G_BUILD_VECTOR %12(s32), %13(s32), %14(s32), %15(s32)
+ %16:_(<4 x s32>) = reassoc contract G_FMUL %0, %1
+ %17:_(<4 x s32>) = reassoc contract G_FADD %16, %2
+ %19:_(s32), %20:_(s32), %21:_(s32), %22:_(s32) = G_UNMERGE_VALUES %17(<4 x s32>)
+ $vgpr0 = COPY %19(s32)
+ $vgpr1 = COPY %20(s32)
+ $vgpr2 = COPY %21(s32)
+ $vgpr3 = COPY %22(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+...
+
+---
name: test_3xfloat_add_mul_rhs
body: |
bb.1.entry:
@@ -1083,28 +1222,6 @@ body: |
; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
;
- ; GFX9-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
- ;
; GFX9-DENORM-LABEL: name: test_3xfloat_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX9-DENORM-NEXT: {{ $}}
@@ -1128,28 +1245,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
;
- ; GFX9-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
- ;
; GFX10-LABEL: name: test_3xfloat_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-NEXT: {{ $}}
@@ -1173,28 +1268,6 @@ body: |
; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
;
- ; GFX10-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
- ;
; GFX10-DENORM-LABEL: name: test_3xfloat_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; GFX10-DENORM-NEXT: {{ $}}
@@ -1217,28 +1290,124 @@ body: |
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %6:_(s32) = COPY $vgpr2
+ %0:_(<3 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32), %6(s32)
+ %7:_(s32) = COPY $vgpr3
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %1:_(<3 x s32>) = G_BUILD_VECTOR %7(s32), %8(s32), %9(s32)
+ %10:_(s32) = COPY $vgpr6
+ %11:_(s32) = COPY $vgpr7
+ %12:_(s32) = COPY $vgpr8
+ %2:_(<3 x s32>) = G_BUILD_VECTOR %10(s32), %11(s32), %12(s32)
+ %13:_(<3 x s32>) = reassoc G_FMUL %0, %1
+ %14:_(<3 x s32>) = reassoc G_FADD %2, %13
+ %16:_(s32), %17:_(s32), %18:_(s32) = G_UNMERGE_VALUES %14(<3 x s32>)
+ $vgpr0 = COPY %16(s32)
+ $vgpr1 = COPY %17(s32)
+ $vgpr2 = COPY %18(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+...
+
+---
+name: test_3xfloat_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+
+ ; GFX9-LABEL: name: test_3xfloat_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+ ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
+ ; GFX9-DENORM-LABEL: name: test_3xfloat_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+ ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
+ ; GFX10-LABEL: name: test_3xfloat_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+ ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
;
- ; GFX10-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ; GFX10-DENORM-LABEL: name: test_3xfloat_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
+ ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
+ ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%6:_(s32) = COPY $vgpr2
@@ -1285,24 +1454,6 @@ body: |
; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-CONTRACT-LABEL: name: test_4xhalf_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX9-DENORM-LABEL: name: test_4xhalf_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX9-DENORM-NEXT: {{ $}}
@@ -1322,24 +1473,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-UNSAFE-LABEL: name: test_4xhalf_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-LABEL: name: test_4xhalf_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
@@ -1359,24 +1492,6 @@ body: |
; GFX10-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX10-CONTRACT-LABEL: name: test_4xhalf_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-DENORM-LABEL: name: test_4xhalf_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-DENORM-NEXT: {{ $}}
@@ -1395,24 +1510,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
- ; GFX10-UNSAFE-LABEL: name: test_4xhalf_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
%4:_(<2 x s16>) = COPY $vgpr0
%5:_(<2 x s16>) = COPY $vgpr1
%0:_(<4 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>)
@@ -1431,6 +1528,100 @@ body: |
...
---
+name: test_4xhalf_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; GFX9-LABEL: name: test_4xhalf_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX9-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX9-DENORM-LABEL: name: test_4xhalf_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-LABEL: name: test_4xhalf_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX10-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-DENORM-LABEL: name: test_4xhalf_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ %4:_(<2 x s16>) = COPY $vgpr0
+ %5:_(<2 x s16>) = COPY $vgpr1
+ %0:_(<4 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>)
+ %6:_(<2 x s16>) = COPY $vgpr2
+ %7:_(<2 x s16>) = COPY $vgpr3
+ %1:_(<4 x s16>) = G_CONCAT_VECTORS %6(<2 x s16>), %7(<2 x s16>)
+ %8:_(<2 x s16>) = COPY $vgpr4
+ %9:_(<2 x s16>) = COPY $vgpr5
+ %2:_(<4 x s16>) = G_CONCAT_VECTORS %8(<2 x s16>), %9(<2 x s16>)
+ %10:_(<4 x s16>) = reassoc contract G_FMUL %0, %1
+ %11:_(<4 x s16>) = reassoc contract G_FADD %10, %2
+ %13:_(<2 x s16>), %14:_(<2 x s16>) = G_UNMERGE_VALUES %11(<4 x s16>)
+ $vgpr0 = COPY %13(<2 x s16>)
+ $vgpr1 = COPY %14(<2 x s16>)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+---
name: test_3xhalf_add_mul_rhs
body: |
bb.1.entry:
@@ -1461,30 +1652,6 @@ body: |
; GFX9-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-CONTRACT-LABEL: name: test_3xhalf_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]]
- ; GFX9-CONTRACT-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
- ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
- ; GFX9-CONTRACT-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX9-DENORM-LABEL: name: test_3xhalf_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX9-DENORM-NEXT: {{ $}}
@@ -1510,30 +1677,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX9-UNSAFE-LABEL: name: test_3xhalf_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]]
- ; GFX9-UNSAFE-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
- ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
- ; GFX9-UNSAFE-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-LABEL: name: test_3xhalf_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-NEXT: {{ $}}
@@ -1559,30 +1702,6 @@ body: |
; GFX10-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
;
- ; GFX10-CONTRACT-LABEL: name: test_3xhalf_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]]
- ; GFX10-CONTRACT-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
- ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
- ; GFX10-CONTRACT-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
; GFX10-DENORM-LABEL: name: test_3xhalf_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GFX10-DENORM-NEXT: {{ $}}
@@ -1607,30 +1726,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
- ;
- ; GFX10-UNSAFE-LABEL: name: test_3xhalf_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]]
- ; GFX10-UNSAFE-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
- ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
- ; GFX10-UNSAFE-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
%4:_(<2 x s16>) = COPY $vgpr0
%5:_(<2 x s16>) = COPY $vgpr1
%10:_(<2 x s16>) = G_IMPLICIT_DEF
@@ -1655,6 +1750,130 @@ body: |
...
---
+name: test_3xhalf_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; GFX9-LABEL: name: test_3xhalf_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX9-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]]
+ ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
+ ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX9-DENORM-LABEL: name: test_3xhalf_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]]
+ ; GFX9-DENORM-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+ ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
+ ; GFX9-DENORM-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-LABEL: name: test_3xhalf_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX10-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]]
+ ; GFX10-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
+ ; GFX10-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX10-DENORM-LABEL: name: test_3xhalf_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>)
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]]
+ ; GFX10-DENORM-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
+ ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>)
+ ; GFX10-DENORM-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+ %4:_(<2 x s16>) = COPY $vgpr0
+ %5:_(<2 x s16>) = COPY $vgpr1
+ %10:_(<2 x s16>) = G_IMPLICIT_DEF
+ %11:_(<6 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>), %10(<2 x s16>)
+ %0:_(<3 x s16>), %12:_(<3 x s16>) = G_UNMERGE_VALUES %11(<6 x s16>)
+ %6:_(<2 x s16>) = COPY $vgpr2
+ %7:_(<2 x s16>) = COPY $vgpr3
+ %13:_(<6 x s16>) = G_CONCAT_VECTORS %6(<2 x s16>), %7(<2 x s16>), %10(<2 x s16>)
+ %1:_(<3 x s16>), %14:_(<3 x s16>) = G_UNMERGE_VALUES %13(<6 x s16>)
+ %8:_(<2 x s16>) = COPY $vgpr4
+ %9:_(<2 x s16>) = COPY $vgpr5
+ %15:_(<6 x s16>) = G_CONCAT_VECTORS %8(<2 x s16>), %9(<2 x s16>), %10(<2 x s16>)
+ %2:_(<3 x s16>), %16:_(<3 x s16>) = G_UNMERGE_VALUES %15(<6 x s16>)
+ %17:_(<3 x s16>) = reassoc contract G_FMUL %0, %1
+ %18:_(<3 x s16>) = reassoc contract G_FADD %2, %17
+ %22:_(<3 x s16>) = G_IMPLICIT_DEF
+ %23:_(<6 x s16>) = G_CONCAT_VECTORS %18(<3 x s16>), %22(<3 x s16>)
+ %20:_(<2 x s16>), %21:_(<2 x s16>), %24:_(<2 x s16>) = G_UNMERGE_VALUES %23(<6 x s16>)
+ $vgpr0 = COPY %20(<2 x s16>)
+ $vgpr1 = COPY %21(<2 x s16>)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+---
name: test_4xdouble_add_mul
body: |
bb.1.entry:
@@ -1715,60 +1934,6 @@ body: |
; GFX9-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
;
- ; GFX9-CONTRACT-LABEL: name: test_4xdouble_add_mul
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
- ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX9-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX9-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX9-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX9-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
- ; GFX9-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX9-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX9-CONTRACT-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
- ; GFX9-CONTRACT-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
- ; GFX9-CONTRACT-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
- ; GFX9-CONTRACT-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
- ; GFX9-CONTRACT-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
- ; GFX9-CONTRACT-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
- ; GFX9-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr6 = COPY [[UV6]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr7 = COPY [[UV7]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
- ;
; GFX9-DENORM-LABEL: name: test_4xdouble_add_mul
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX9-DENORM-NEXT: {{ $}}
@@ -1824,60 +1989,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
;
- ; GFX9-UNSAFE-LABEL: name: test_4xdouble_add_mul
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
- ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX9-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX9-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX9-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX9-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
- ; GFX9-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX9-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX9-UNSAFE-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
- ; GFX9-UNSAFE-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
- ; GFX9-UNSAFE-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
- ; GFX9-UNSAFE-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
- ; GFX9-UNSAFE-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
- ; GFX9-UNSAFE-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
- ; GFX9-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr6 = COPY [[UV6]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr7 = COPY [[UV7]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
- ;
; GFX10-LABEL: name: test_4xdouble_add_mul
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX10-NEXT: {{ $}}
@@ -1933,60 +2044,6 @@ body: |
; GFX10-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
;
- ; GFX10-CONTRACT-LABEL: name: test_4xdouble_add_mul
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
- ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX10-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX10-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX10-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX10-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
- ; GFX10-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX10-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX10-CONTRACT-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
- ; GFX10-CONTRACT-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
- ; GFX10-CONTRACT-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
- ; GFX10-CONTRACT-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
- ; GFX10-CONTRACT-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
- ; GFX10-CONTRACT-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
- ; GFX10-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr6 = COPY [[UV6]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr7 = COPY [[UV7]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
- ;
; GFX10-DENORM-LABEL: name: test_4xdouble_add_mul
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
; GFX10-DENORM-NEXT: {{ $}}
@@ -2041,60 +2098,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32)
; GFX10-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
- ;
- ; GFX10-UNSAFE-LABEL: name: test_4xdouble_add_mul
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
- ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX10-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX10-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX10-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX10-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
- ; GFX10-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX10-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX10-UNSAFE-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
- ; GFX10-UNSAFE-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
- ; GFX10-UNSAFE-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
- ; GFX10-UNSAFE-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
- ; GFX10-UNSAFE-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
- ; GFX10-UNSAFE-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
- ; GFX10-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr6 = COPY [[UV6]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr7 = COPY [[UV7]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%6:_(s32) = COPY $vgpr2
@@ -2149,6 +2152,280 @@ body: |
...
---
+name: test_4xdouble_add_mul_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+
+ ; GFX9-LABEL: name: test_4xdouble_add_mul_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
+ ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
+ ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
+ ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
+ ; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
+ ; GFX9-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
+ ; GFX9-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX9-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX9-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX9-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX9-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX9-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX9-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX9-DENORM-LABEL: name: test_4xdouble_add_mul_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+ ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+ ; GFX9-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX9-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX9-DENORM-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
+ ; GFX9-DENORM-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
+ ; GFX9-DENORM-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
+ ; GFX9-DENORM-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
+ ; GFX9-DENORM-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
+ ; GFX9-DENORM-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
+ ; GFX9-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX9-DENORM-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX9-DENORM-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX9-DENORM-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX10-LABEL: name: test_4xdouble_add_mul_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX10-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
+ ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
+ ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
+ ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
+ ; GFX10-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX10-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX10-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX10-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX10-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX10-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ ;
+ ; GFX10-DENORM-LABEL: name: test_4xdouble_add_mul_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+ ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX10-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX10-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX10-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX10-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+ ; GFX10-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX10-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX10-DENORM-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18
+ ; GFX10-DENORM-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19
+ ; GFX10-DENORM-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20
+ ; GFX10-DENORM-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21
+ ; GFX10-DENORM-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22
+ ; GFX10-DENORM-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23
+ ; GFX10-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-DENORM-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32)
+ ; GFX10-DENORM-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32)
+ ; GFX10-DENORM-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %10:_(s32) = COPY $vgpr6
+ %11:_(s32) = COPY $vgpr7
+ %28:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %29:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+ %30:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+ %31:_(s64) = G_MERGE_VALUES %10(s32), %11(s32)
+ %0:_(<4 x s64>) = G_BUILD_VECTOR %28(s64), %29(s64), %30(s64), %31(s64)
+ %12:_(s32) = COPY $vgpr8
+ %13:_(s32) = COPY $vgpr9
+ %14:_(s32) = COPY $vgpr10
+ %15:_(s32) = COPY $vgpr11
+ %16:_(s32) = COPY $vgpr12
+ %17:_(s32) = COPY $vgpr13
+ %18:_(s32) = COPY $vgpr14
+ %19:_(s32) = COPY $vgpr15
+ %32:_(s64) = G_MERGE_VALUES %12(s32), %13(s32)
+ %33:_(s64) = G_MERGE_VALUES %14(s32), %15(s32)
+ %34:_(s64) = G_MERGE_VALUES %16(s32), %17(s32)
+ %35:_(s64) = G_MERGE_VALUES %18(s32), %19(s32)
+ %1:_(<4 x s64>) = G_BUILD_VECTOR %32(s64), %33(s64), %34(s64), %35(s64)
+ %20:_(s32) = COPY $vgpr16
+ %21:_(s32) = COPY $vgpr17
+ %22:_(s32) = COPY $vgpr18
+ %23:_(s32) = COPY $vgpr19
+ %24:_(s32) = COPY $vgpr20
+ %25:_(s32) = COPY $vgpr21
+ %26:_(s32) = COPY $vgpr22
+ %27:_(s32) = COPY $vgpr23
+ %36:_(s64) = G_MERGE_VALUES %20(s32), %21(s32)
+ %37:_(s64) = G_MERGE_VALUES %22(s32), %23(s32)
+ %38:_(s64) = G_MERGE_VALUES %24(s32), %25(s32)
+ %39:_(s64) = G_MERGE_VALUES %26(s32), %27(s32)
+ %2:_(<4 x s64>) = G_BUILD_VECTOR %36(s64), %37(s64), %38(s64), %39(s64)
+ %40:_(<4 x s64>) = reassoc contract G_FMUL %0, %1
+ %41:_(<4 x s64>) = reassoc contract G_FADD %40, %2
+ %43:_(s32), %44:_(s32), %45:_(s32), %46:_(s32), %47:_(s32), %48:_(s32), %49:_(s32), %50:_(s32) = G_UNMERGE_VALUES %41(<4 x s64>)
+ $vgpr0 = COPY %43(s32)
+ $vgpr1 = COPY %44(s32)
+ $vgpr2 = COPY %45(s32)
+ $vgpr3 = COPY %46(s32)
+ $vgpr4 = COPY %47(s32)
+ $vgpr5 = COPY %48(s32)
+ $vgpr6 = COPY %49(s32)
+ $vgpr7 = COPY %50(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+...
+
+---
name: test_3xdouble_add_mul_rhs
body: |
bb.1.entry:
@@ -2198,49 +2475,6 @@ body: |
; GFX9-NEXT: $vgpr5 = COPY [[UV5]](s32)
; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
;
- ; GFX9-CONTRACT-LABEL: name: test_3xdouble_add_mul_rhs
- ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
- ; GFX9-CONTRACT-NEXT: {{ $}}
- ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
- ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
- ; GFX9-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX9-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX9-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX9-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX9-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX9-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX9-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
- ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX9-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
- ;
; GFX9-DENORM-LABEL: name: test_3xdouble_add_mul_rhs
; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
; GFX9-DENORM-NEXT: {{ $}}
@@ -2285,49 +2519,6 @@ body: |
; GFX9-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
;
- ; GFX9-UNSAFE-LABEL: name: test_3xdouble_add_mul_rhs
- ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
- ; GFX9-UNSAFE-NEXT: {{ $}}
- ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
- ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX9-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX9-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
- ; GFX9-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX9-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX9-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX9-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX9-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX9-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX9-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
- ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX9-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
- ;
; GFX10-LABEL: name: test_3xdouble_add_mul_rhs
; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
; GFX10-NEXT: {{ $}}
@@ -2372,49 +2563,6 @@ body: |
; GFX10-NEXT: $vgpr5 = COPY [[UV5]](s32)
; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
;
- ; GFX10-CONTRACT-LABEL: name: test_3xdouble_add_mul_rhs
- ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
- ; GFX10-CONTRACT-NEXT: {{ $}}
- ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
- ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
- ; GFX10-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX10-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX10-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX10-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX10-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX10-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX10-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
- ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX10-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
- ;
; GFX10-DENORM-LABEL: name: test_3xdouble_add_mul_rhs
; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
; GFX10-DENORM-NEXT: {{ $}}
@@ -2458,49 +2606,6 @@ body: |
; GFX10-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
; GFX10-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
- ;
- ; GFX10-UNSAFE-LABEL: name: test_3xdouble_add_mul_rhs
- ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
- ; GFX10-UNSAFE-NEXT: {{ $}}
- ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
- ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
- ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX10-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
- ; GFX10-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
- ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
- ; GFX10-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
- ; GFX10-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
- ; GFX10-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
- ; GFX10-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
- ; GFX10-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
- ; GFX10-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
- ; GFX10-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
- ; GFX10-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
- ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
- ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32)
- ; GFX10-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32)
- ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
%4:_(s32) = COPY $vgpr0
%5:_(s32) = COPY $vgpr1
%6:_(s32) = COPY $vgpr2
@@ -2542,3 +2647,222 @@ body: |
$vgpr5 = COPY %39(s32)
S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
...
+
+---
+name: test_3xdouble_add_mul_rhs_contract
+body: |
+ bb.1.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+
+ ; GFX9-LABEL: name: test_3xdouble_add_mul_rhs_contract
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+ ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX9-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX9-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX9-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
+ ; GFX9-DENORM-LABEL: name: test_3xdouble_add_mul_rhs_contract
+ ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+ ; GFX9-DENORM-NEXT: {{ $}}
+ ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+ ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX9-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX9-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX9-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX9-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX9-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
+ ; GFX9-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX9-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX9-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX9-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX9-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX9-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX9-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX9-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX9-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+ ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
+ ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX9-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
+ ; GFX10-LABEL: name: test_3xdouble_add_mul_rhs_contract
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX10-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
+ ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX10-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ ;
+ ; GFX10-DENORM-LABEL: name: test_3xdouble_add_mul_rhs_contract
+ ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17
+ ; GFX10-DENORM-NEXT: {{ $}}
+ ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+ ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
+ ; GFX10-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10
+ ; GFX10-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11
+ ; GFX10-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+ ; GFX10-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
+ ; GFX10-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64)
+ ; GFX10-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12
+ ; GFX10-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
+ ; GFX10-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
+ ; GFX10-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
+ ; GFX10-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16
+ ; GFX10-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17
+ ; GFX10-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
+ ; GFX10-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64)
+ ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
+ ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>)
+ ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32)
+ ; GFX10-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32)
+ ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+ %4:_(s32) = COPY $vgpr0
+ %5:_(s32) = COPY $vgpr1
+ %6:_(s32) = COPY $vgpr2
+ %7:_(s32) = COPY $vgpr3
+ %8:_(s32) = COPY $vgpr4
+ %9:_(s32) = COPY $vgpr5
+ %22:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %23:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+ %24:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+ %0:_(<3 x s64>) = G_BUILD_VECTOR %22(s64), %23(s64), %24(s64)
+ %10:_(s32) = COPY $vgpr6
+ %11:_(s32) = COPY $vgpr7
+ %12:_(s32) = COPY $vgpr8
+ %13:_(s32) = COPY $vgpr9
+ %14:_(s32) = COPY $vgpr10
+ %15:_(s32) = COPY $vgpr11
+ %25:_(s64) = G_MERGE_VALUES %10(s32), %11(s32)
+ %26:_(s64) = G_MERGE_VALUES %12(s32), %13(s32)
+ %27:_(s64) = G_MERGE_VALUES %14(s32), %15(s32)
+ %1:_(<3 x s64>) = G_BUILD_VECTOR %25(s64), %26(s64), %27(s64)
+ %16:_(s32) = COPY $vgpr12
+ %17:_(s32) = COPY $vgpr13
+ %18:_(s32) = COPY $vgpr14
+ %19:_(s32) = COPY $vgpr15
+ %20:_(s32) = COPY $vgpr16
+ %21:_(s32) = COPY $vgpr17
+ %28:_(s64) = G_MERGE_VALUES %16(s32), %17(s32)
+ %29:_(s64) = G_MERGE_VALUES %18(s32), %19(s32)
+ %30:_(s64) = G_MERGE_VALUES %20(s32), %21(s32)
+ %2:_(<3 x s64>) = G_BUILD_VECTOR %28(s64), %29(s64), %30(s64)
+ %31:_(<3 x s64>) = reassoc contract G_FMUL %0, %1
+ %32:_(<3 x s64>) = reassoc contract G_FADD %2, %31
+ %34:_(s32), %35:_(s32), %36:_(s32), %37:_(s32), %38:_(s32), %39:_(s32) = G_UNMERGE_VALUES %32(<3 x s64>)
+ $vgpr0 = COPY %34(s32)
+ $vgpr1 = COPY %35(s32)
+ $vgpr2 = COPY %36(s32)
+ $vgpr3 = COPY %37(s32)
+ $vgpr4 = COPY %38(s32)
+ $vgpr5 = COPY %39(s32)
+ S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5
+...
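# The MIR above feeds the AMDGPU post-legalizer combiner: a G_FMUL and a
# G_FADD that both carry the `contract` fast-math flag are folded into a
# single G_FMA, as the CHECK lines for every run configuration show. A
# minimal LLVM IR input that lowers to the same pattern is sketched below
# (function and value names are illustrative, not taken from the test):
#
#   define double @contract_fma_sketch(double %x, double %y, double %z) {
#     %m = fmul contract double %x, %y   ; contract permits fusing %m ...
#     %a = fadd contract double %m, %z   ; ... into an fma with this add
#     ret double %a
#   }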
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
index 24dd535..3f6e3d8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
@@ -2,11 +2,9 @@
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -fp-contract=fast < %s | FileCheck -check-prefix=GFX9-CONTRACT %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GFX9-UNSAFE %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GFX10-UNSAFE %s
define float @test_f32_add_mul(float %x, float %y, float %z) {
; GFX9-LABEL: test_f32_add_mul:
@@ -28,12 +26,6 @@ define float @test_f32_add_mul(float %x, float %y, float %z) {
; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_f32_add_mul:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_f32_add_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -52,7 +44,6 @@ define float @test_f32_add_mul(float %x, float %y, float %z) {
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-UNSAFE-LABEL: test_f32_add_mul:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -64,6 +55,58 @@ define float @test_f32_add_mul(float %x, float %y, float %z) {
ret float %b
}
+define float @test_f32_add_mul_contract(float %x, float %y, float %z) {
+; GFX9-LABEL: test_f32_add_mul_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_f32_add_mul_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_f32_add_mul_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_f32_add_mul_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_f32_add_mul_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_f32_add_mul_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_f32_add_mul_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_f32_add_mul_contract:
+; GFX10-UNSAFE: ; %bb.0: ; %.entry
+; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul contract float %x, %y
+ %b = fadd contract float %a, %z
+ ret float %b
+}
+
define float @test_f32_add_mul_rhs(float %x, float %y, float %z) {
; GFX9-LABEL: test_f32_add_mul_rhs:
; GFX9: ; %bb.0: ; %.entry
@@ -84,12 +127,6 @@ define float @test_f32_add_mul_rhs(float %x, float %y, float %z) {
; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_f32_add_mul_rhs:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_f32_add_mul_rhs:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -108,7 +145,6 @@ define float @test_f32_add_mul_rhs(float %x, float %y, float %z) {
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-UNSAFE-LABEL: test_f32_add_mul_rhs:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -120,6 +156,58 @@ define float @test_f32_add_mul_rhs(float %x, float %y, float %z) {
ret float %b
}
+define float @test_f32_add_mul_rhs_contract(float %x, float %y, float %z) {
+; GFX9-LABEL: test_f32_add_mul_rhs_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_f32_add_mul_rhs_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_f32_add_mul_rhs_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_f32_add_mul_rhs_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_f32_add_mul_rhs_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_f32_add_mul_rhs_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_f32_add_mul_rhs_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_f32_add_mul_rhs_contract:
+; GFX10-UNSAFE: ; %bb.0: ; %.entry
+; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul contract float %x, %y
+ %b = fadd contract float %z, %a
+ ret float %b
+}
+
define float @test_add_mul_multiple_defs_z(float %x, float %y, ptr addrspace(1) %vec_ptr) {
; GFX9-LABEL: test_add_mul_multiple_defs_z:
; GFX9: ; %bb.0: ; %.entry
@@ -147,14 +235,6 @@ define float @test_add_mul_multiple_defs_z(float %x, float %y, ptr addrspace(1)
; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_add_mul_multiple_defs_z:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_add_mul_multiple_defs_z:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -181,7 +261,6 @@ define float @test_add_mul_multiple_defs_z(float %x, float %y, ptr addrspace(1)
; GFX10-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1
; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-UNSAFE-LABEL: test_add_mul_multiple_defs_z:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -198,17 +277,16 @@ define float @test_add_mul_multiple_defs_z(float %x, float %y, ptr addrspace(1)
ret float %b
}
-define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace(1) %vec_ptr) {
-; GFX9-LABEL: test_add_mul_rhs_multiple_defs_z:
+define float @test_add_mul_multiple_defs_z_contract(float %x, float %y, ptr addrspace(1) %vec_ptr) {
+; GFX9-LABEL: test_add_mul_multiple_defs_z_contract:
; GFX9: ; %bb.0: ; %.entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-CONTRACT-LABEL: test_add_mul_rhs_multiple_defs_z:
+; GFX9-CONTRACT-LABEL: test_add_mul_multiple_defs_z_contract:
; GFX9-CONTRACT: ; %bb.0: ; %.entry
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4
@@ -216,7 +294,7 @@ define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace
; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-DENORM-LABEL: test_add_mul_rhs_multiple_defs_z:
+; GFX9-DENORM-LABEL: test_add_mul_multiple_defs_z_contract:
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4
@@ -225,13 +303,81 @@ define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace
; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_add_mul_rhs_multiple_defs_z:
+; GFX10-LABEL: test_add_mul_multiple_defs_z_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v2, v[2:3], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_add_mul_multiple_defs_z_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_add_mul_multiple_defs_z_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_add_mul_multiple_defs_z_contract:
; GFX9-UNSAFE: ; %bb.0: ; %.entry
; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0)
; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_add_mul_multiple_defs_z_contract:
+; GFX10-UNSAFE: ; %bb.0: ; %.entry
+; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4
+; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0)
+; GFX10-UNSAFE-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul contract float %x, %y
+ %vec = load <2 x float>, ptr addrspace(1) %vec_ptr
+ %z = extractelement <2 x float> %vec, i64 1
+ %b = fadd contract float %a, %z
+ ret float %b
+}
+
+define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace(1) %vec_ptr) {
+; GFX9-LABEL: test_add_mul_rhs_multiple_defs_z:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_add_mul_rhs_multiple_defs_z:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_add_mul_rhs_multiple_defs_z:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1
+; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX10: ; %bb.0: ; %.entry
@@ -259,7 +405,6 @@ define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace
; GFX10-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1
; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-UNSAFE-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -296,12 +441,6 @@ define half @test_half_add_mul(half %x, half %y, half %z) {
; GFX9-DENORM-NEXT: v_mad_legacy_f16 v0, v0, v1, v2
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_half_add_mul:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_half_add_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -321,7 +460,6 @@ define half @test_half_add_mul(half %x, half %y, half %z) {
; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v0, v2
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-UNSAFE-LABEL: test_half_add_mul:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -333,6 +471,59 @@ define half @test_half_add_mul(half %x, half %y, half %z) {
ret half %b
}
+define half @test_half_add_mul_contract(half %x, half %y, half %z) {
+; GFX9-LABEL: test_half_add_mul_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_half_add_mul_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_half_add_mul_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_mad_legacy_f16 v0, v0, v1, v2
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_half_add_mul_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_half_add_mul_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_half_add_mul_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v0, v2
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_half_add_mul_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_half_add_mul_contract:
+; GFX10-UNSAFE: ; %bb.0: ; %.entry
+; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul contract half %x, %y
+ %b = fadd contract half %a, %z
+ ret half %b
+}
+
define half @test_half_add_mul_rhs(half %x, half %y, half %z) {
; GFX9-LABEL: test_half_add_mul_rhs:
; GFX9: ; %bb.0: ; %.entry
@@ -353,12 +544,6 @@ define half @test_half_add_mul_rhs(half %x, half %y, half %z) {
; GFX9-DENORM-NEXT: v_mad_legacy_f16 v0, v0, v1, v2
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_half_add_mul_rhs:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_half_add_mul_rhs:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -378,7 +563,6 @@ define half @test_half_add_mul_rhs(half %x, half %y, half %z) {
; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v2, v0
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-UNSAFE-LABEL: test_half_add_mul_rhs:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -390,6 +574,59 @@ define half @test_half_add_mul_rhs(half %x, half %y, half %z) {
ret half %b
}
+define half @test_half_add_mul_rhs_contract(half %x, half %y, half %z) {
+; GFX9-LABEL: test_half_add_mul_rhs_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_half_add_mul_rhs_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_half_add_mul_rhs_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_mad_legacy_f16 v0, v0, v1, v2
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_half_add_mul_rhs_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_half_add_mul_rhs_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_half_add_mul_rhs_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v2, v0
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_half_add_mul_rhs_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_half_add_mul_rhs_contract:
+; GFX10-UNSAFE: ; %bb.0: ; %.entry
+; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul contract half %x, %y
+ %b = fadd contract half %z, %a
+ ret half %b
+}
+
define double @test_double_add_mul(double %x, double %y, double %z) {
; GFX9-LABEL: test_double_add_mul:
; GFX9: ; %bb.0: ; %.entry
@@ -411,12 +648,6 @@ define double @test_double_add_mul(double %x, double %y, double %z) {
; GFX9-DENORM-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5]
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_double_add_mul:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_double_add_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -436,15 +667,61 @@ define double @test_double_add_mul(double %x, double %y, double %z) {
; GFX10-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
; GFX10-DENORM-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5]
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul double %x, %y
+ %b = fadd double %a, %z
+ ret double %b
+}
+
+define double @test_double_add_mul_contract(double %x, double %y, double %z) {
+; GFX9-LABEL: test_double_add_mul_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_double_add_mul_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_double_add_mul_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
+; GFX10-LABEL: test_double_add_mul_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_double_add_mul_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_double_add_mul_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_double_add_mul_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
; GFX10-UNSAFE-LABEL: test_double_add_mul:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
.entry:
- %a = fmul double %x, %y
- %b = fadd double %a, %z
+ %a = fmul contract double %x, %y
+ %b = fadd contract double %a, %z
ret double %b
}
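; The RUN-line change at the top of this file drops the two
; -enable-unsafe-fp-math configurations, so the stale GFX9-UNSAFE/GFX10-UNSAFE
; check blocks that remain above are no longer verified by FileCheck.
; Assuming the usual workflow for these autogenerated tests, the assertions
; would be refreshed with the in-tree script, e.g.:
;
;   python3 llvm/utils/update_llc_test_checks.py \
;     llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll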
@@ -469,12 +746,6 @@ define double @test_double_add_mul_rhs(double %x, double %y, double %z) {
; GFX9-DENORM-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1]
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_double_add_mul_rhs:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_double_add_mul_rhs:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -494,15 +765,61 @@ define double @test_double_add_mul_rhs(double %x, double %y, double %z) {
; GFX10-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
; GFX10-DENORM-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1]
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul double %x, %y
+ %b = fadd double %z, %a
+ ret double %b
+}
+
+define double @test_double_add_mul_rhs_contract(double %x, double %y, double %z) {
+; GFX9-LABEL: test_double_add_mul_rhs_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_double_add_mul_rhs_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_double_add_mul_rhs_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-UNSAFE-LABEL: test_double_add_mul_rhs:
+; GFX10-LABEL: test_double_add_mul_rhs_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_double_add_mul_rhs_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_double_add_mul_rhs_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_double_add_mul_rhs_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_double_add_mul_rhs_contract:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
.entry:
- %a = fmul double %x, %y
- %b = fadd double %z, %a
+ %a = fmul contract double %x, %y
+ %b = fadd contract double %z, %a
ret double %b
}
@@ -538,15 +855,6 @@ define <4 x float> @test_4xfloat_add_mul(<4 x float> %x, <4 x float> %y, <4 x fl
; GFX9-DENORM-NEXT: v_mad_f32 v3, v3, v7, v11
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_4xfloat_add_mul:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v4, v8
-; GFX9-UNSAFE-NEXT: v_fma_f32 v1, v1, v5, v9
-; GFX9-UNSAFE-NEXT: v_fma_f32 v2, v2, v6, v10
-; GFX9-UNSAFE-NEXT: v_fma_f32 v3, v3, v7, v11
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_4xfloat_add_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -577,8 +885,75 @@ define <4 x float> @test_4xfloat_add_mul(<4 x float> %x, <4 x float> %y, <4 x fl
; GFX10-DENORM-NEXT: v_mad_f32 v2, v2, v6, v10
; GFX10-DENORM-NEXT: v_mad_f32 v3, v3, v7, v11
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul <4 x float> %x, %y
+ %b = fadd <4 x float> %a, %z
+ ret <4 x float> %b
+}
+
+define <4 x float> @test_4xfloat_add_mul_contract(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
+; GFX9-LABEL: test_4xfloat_add_mul_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f32 v0, v0, v4, v8
+; GFX9-NEXT: v_fma_f32 v1, v1, v5, v9
+; GFX9-NEXT: v_fma_f32 v2, v2, v6, v10
+; GFX9-NEXT: v_fma_f32 v3, v3, v7, v11
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_4xfloat_add_mul_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v4, v8
+; GFX9-CONTRACT-NEXT: v_fma_f32 v1, v1, v5, v9
+; GFX9-CONTRACT-NEXT: v_fma_f32 v2, v2, v6, v10
+; GFX9-CONTRACT-NEXT: v_fma_f32 v3, v3, v7, v11
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_4xfloat_add_mul_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v4, v8
+; GFX9-DENORM-NEXT: v_mad_f32 v1, v1, v5, v9
+; GFX9-DENORM-NEXT: v_mad_f32 v2, v2, v6, v10
+; GFX9-DENORM-NEXT: v_mad_f32 v3, v3, v7, v11
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-UNSAFE-LABEL: test_4xfloat_add_mul:
+; GFX10-LABEL: test_4xfloat_add_mul_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f32 v0, v0, v4, v8
+; GFX10-NEXT: v_fma_f32 v1, v1, v5, v9
+; GFX10-NEXT: v_fma_f32 v2, v2, v6, v10
+; GFX10-NEXT: v_fma_f32 v3, v3, v7, v11
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_4xfloat_add_mul_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f32 v0, v0, v4, v8
+; GFX10-CONTRACT-NEXT: v_fma_f32 v1, v1, v5, v9
+; GFX10-CONTRACT-NEXT: v_fma_f32 v2, v2, v6, v10
+; GFX10-CONTRACT-NEXT: v_fma_f32 v3, v3, v7, v11
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_4xfloat_add_mul_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_fma_f32 v0, v0, v4, v8
+; GFX10-DENORM-NEXT: v_fma_f32 v1, v1, v5, v9
+; GFX10-DENORM-NEXT: v_fma_f32 v2, v2, v6, v10
+; GFX10-DENORM-NEXT: v_fma_f32 v3, v3, v7, v11
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_4xfloat_add_mul_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v4, v8
+; GFX9-UNSAFE-NEXT: v_fma_f32 v1, v1, v5, v9
+; GFX9-UNSAFE-NEXT: v_fma_f32 v2, v2, v6, v10
+; GFX9-UNSAFE-NEXT: v_fma_f32 v3, v3, v7, v11
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_4xfloat_add_mul_contract:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-UNSAFE-NEXT: v_fma_f32 v0, v0, v4, v8
@@ -587,8 +962,8 @@ define <4 x float> @test_4xfloat_add_mul(<4 x float> %x, <4 x float> %y, <4 x fl
; GFX10-UNSAFE-NEXT: v_fma_f32 v3, v3, v7, v11
; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
.entry:
- %a = fmul <4 x float> %x, %y
- %b = fadd <4 x float> %a, %z
+ %a = fmul contract <4 x float> %x, %y
+ %b = fadd contract <4 x float> %a, %z
ret <4 x float> %b
}
@@ -620,14 +995,6 @@ define <3 x float> @test_3xfloat_add_mul_rhs(<3 x float> %x, <3 x float> %y, <3
; GFX9-DENORM-NEXT: v_mad_f32 v2, v2, v5, v8
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_3xfloat_add_mul_rhs:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v3, v6
-; GFX9-UNSAFE-NEXT: v_fma_f32 v1, v1, v4, v7
-; GFX9-UNSAFE-NEXT: v_fma_f32 v2, v2, v5, v8
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_3xfloat_add_mul_rhs:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -654,8 +1021,68 @@ define <3 x float> @test_3xfloat_add_mul_rhs(<3 x float> %x, <3 x float> %y, <3
; GFX10-DENORM-NEXT: v_mad_f32 v1, v1, v4, v7
; GFX10-DENORM-NEXT: v_mad_f32 v2, v2, v5, v8
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul <3 x float> %x, %y
+ %b = fadd <3 x float> %z, %a
+ ret <3 x float> %b
+}
+
+define <3 x float> @test_3xfloat_add_mul_rhs_contract(<3 x float> %x, <3 x float> %y, <3 x float> %z) {
+; GFX9-LABEL: test_3xfloat_add_mul_rhs_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f32 v0, v0, v3, v6
+; GFX9-NEXT: v_fma_f32 v1, v1, v4, v7
+; GFX9-NEXT: v_fma_f32 v2, v2, v5, v8
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_3xfloat_add_mul_rhs_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v3, v6
+; GFX9-CONTRACT-NEXT: v_fma_f32 v1, v1, v4, v7
+; GFX9-CONTRACT-NEXT: v_fma_f32 v2, v2, v5, v8
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-UNSAFE-LABEL: test_3xfloat_add_mul_rhs:
+; GFX9-DENORM-LABEL: test_3xfloat_add_mul_rhs_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v3, v6
+; GFX9-DENORM-NEXT: v_mad_f32 v1, v1, v4, v7
+; GFX9-DENORM-NEXT: v_mad_f32 v2, v2, v5, v8
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_3xfloat_add_mul_rhs_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f32 v0, v0, v3, v6
+; GFX10-NEXT: v_fma_f32 v1, v1, v4, v7
+; GFX10-NEXT: v_fma_f32 v2, v2, v5, v8
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_3xfloat_add_mul_rhs_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f32 v0, v0, v3, v6
+; GFX10-CONTRACT-NEXT: v_fma_f32 v1, v1, v4, v7
+; GFX10-CONTRACT-NEXT: v_fma_f32 v2, v2, v5, v8
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_3xfloat_add_mul_rhs_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_fma_f32 v0, v0, v3, v6
+; GFX10-DENORM-NEXT: v_fma_f32 v1, v1, v4, v7
+; GFX10-DENORM-NEXT: v_fma_f32 v2, v2, v5, v8
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_3xfloat_add_mul_rhs_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v3, v6
+; GFX9-UNSAFE-NEXT: v_fma_f32 v1, v1, v4, v7
+; GFX9-UNSAFE-NEXT: v_fma_f32 v2, v2, v5, v8
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_3xfloat_add_mul_rhs_contract:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-UNSAFE-NEXT: v_fma_f32 v0, v0, v3, v6
@@ -663,8 +1090,8 @@ define <3 x float> @test_3xfloat_add_mul_rhs(<3 x float> %x, <3 x float> %y, <3
; GFX10-UNSAFE-NEXT: v_fma_f32 v2, v2, v5, v8
; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
.entry:
- %a = fmul <3 x float> %x, %y
- %b = fadd <3 x float> %z, %a
+ %a = fmul contract <3 x float> %x, %y
+ %b = fadd contract <3 x float> %z, %a
ret <3 x float> %b
}
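; Note the f32 split visible above: in the default mode GFX9 selects
; v_fma_f32, while GFX9-DENORM (--denormal-fp-math=preserve-sign) selects
; v_mad_f32 instead. A plausible reading, consistent with these checks, is
; that v_mad_f32 is only usable when f32 denormals are flushed, whereas
; contract-driven fusion can always fall back to the IEEE-accurate
; v_fma_f32.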
@@ -694,13 +1121,6 @@ define <4 x half> @test_4xhalf_add_mul(<4 x half> %x, <4 x half> %y, <4 x half>
; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v1, v5
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_4xhalf_add_mul:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4
-; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_4xhalf_add_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -725,7 +1145,6 @@ define <4 x half> @test_4xhalf_add_mul(<4 x half> %x, <4 x half> %y, <4 x half>
; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v0, v4
; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v1, v5
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-UNSAFE-LABEL: test_4xhalf_add_mul:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -738,6 +1157,70 @@ define <4 x half> @test_4xhalf_add_mul(<4 x half> %x, <4 x half> %y, <4 x half>
ret <4 x half> %b
}
+define <4 x half> @test_4xhalf_add_mul_contract(<4 x half> %x, <4 x half> %y, <4 x half> %z) {
+; GFX9-LABEL: test_4xhalf_add_mul_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_4xhalf_add_mul_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_4xhalf_add_mul_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
+; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
+; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v0, v4
+; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v1, v5
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_4xhalf_add_mul_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_4xhalf_add_mul_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_4xhalf_add_mul_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
+; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
+; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v0, v4
+; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v1, v5
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_4xhalf_add_mul_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_4xhalf_add_mul_contract:
+; GFX10-UNSAFE: ; %bb.0: ; %.entry
+; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul contract <4 x half> %x, %y
+ %b = fadd contract <4 x half> %a, %z
+ ret <4 x half> %b
+}
+
define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x half> %z) {
; GFX9-LABEL: test_3xhalf_add_mul_rhs:
; GFX9: ; %bb.0: ; %.entry
@@ -764,13 +1247,6 @@ define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x ha
; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v5, v1
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_3xhalf_add_mul_rhs:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4
-; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_3xhalf_add_mul_rhs:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -795,16 +1271,73 @@ define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x ha
; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v4, v0
; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v5, v1
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul <3 x half> %x, %y
+ %b = fadd <3 x half> %z, %a
+ ret <3 x half> %b
+}
+
+define <3 x half> @test_3xhalf_add_mul_rhs_contract(<3 x half> %x, <3 x half> %y, <3 x half> %z) {
+; GFX9-LABEL: test_3xhalf_add_mul_rhs_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_3xhalf_add_mul_rhs_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_3xhalf_add_mul_rhs_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
+; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
+; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v4, v0
+; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v5, v1
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_3xhalf_add_mul_rhs_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-UNSAFE-LABEL: test_3xhalf_add_mul_rhs:
+; GFX10-CONTRACT-LABEL: test_3xhalf_add_mul_rhs_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_3xhalf_add_mul_rhs_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
+; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
+; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v4, v0
+; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v5, v1
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+; GFX9-UNSAFE-LABEL: test_3xhalf_add_mul_rhs_contract:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX10-UNSAFE-LABEL: test_3xhalf_add_mul_rhs_contract:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4
; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5
; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
.entry:
- %a = fmul <3 x half> %x, %y
- %b = fadd <3 x half> %z, %a
+ %a = fmul contract <3 x half> %x, %y
+ %b = fadd contract <3 x half> %z, %a
ret <3 x half> %b
}
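; For <3 x half> the checks show two packed v_pk_fma_f16 instructions: the
; vector is carried in two 32-bit VGPRs holding two f16 lanes each, so a
; fourth lane is computed but unused. This is an inference from the register
; usage above (v0/v1 against v2..v5), not something the test asserts
; directly.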
@@ -844,15 +1377,6 @@ define <4 x double> @test_4xdouble_add_mul(<4 x double> %x, <4 x double> %y, <4
; GFX9-DENORM-NEXT: v_add_f64 v[6:7], v[6:7], v[22:23]
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_4xdouble_add_mul:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_4xdouble_add_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -887,7 +1411,14 @@ define <4 x double> @test_4xdouble_add_mul(<4 x double> %x, <4 x double> %y, <4
; GFX10-DENORM-NEXT: v_add_f64 v[4:5], v[4:5], v[20:21]
; GFX10-DENORM-NEXT: v_add_f64 v[6:7], v[6:7], v[22:23]
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
+; GFX9-UNSAFE-LABEL: test_4xdouble_add_mul:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
; GFX10-UNSAFE-LABEL: test_4xdouble_add_mul:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -902,6 +1433,66 @@ define <4 x double> @test_4xdouble_add_mul(<4 x double> %x, <4 x double> %y, <4
ret <4 x double> %b
}
+define <4 x double> @test_4xdouble_add_mul_contract(<4 x double> %x, <4 x double> %y, <4 x double> %z) {
+; GFX9-LABEL: test_4xdouble_add_mul_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX9-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX9-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_4xdouble_add_mul_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_4xdouble_add_mul_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX9-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX9-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX9-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_4xdouble_add_mul_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_4xdouble_add_mul_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_4xdouble_add_mul_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX10-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX10-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX10-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul contract <4 x double> %x, %y
+ %b = fadd contract <4 x double> %a, %z
+ ret <4 x double> %b
+}
+
define <3 x double> @test_3xdouble_add_mul_rhs(<3 x double> %x, <3 x double> %y, <3 x double> %z) {
; GFX9-LABEL: test_3xdouble_add_mul_rhs:
; GFX9: ; %bb.0: ; %.entry
@@ -933,14 +1524,6 @@ define <3 x double> @test_3xdouble_add_mul_rhs(<3 x double> %x, <3 x double> %y,
; GFX9-DENORM-NEXT: v_add_f64 v[4:5], v[16:17], v[4:5]
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-UNSAFE-LABEL: test_3xdouble_add_mul_rhs:
-; GFX9-UNSAFE: ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
-; GFX9-UNSAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
-; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
-;
; GFX10-LABEL: test_3xdouble_add_mul_rhs:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -970,7 +1553,13 @@ define <3 x double> @test_3xdouble_add_mul_rhs(<3 x double> %x, <3 x double> %y,
; GFX10-DENORM-NEXT: v_add_f64 v[2:3], v[14:15], v[2:3]
; GFX10-DENORM-NEXT: v_add_f64 v[4:5], v[16:17], v[4:5]
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
-;
+; GFX9-UNSAFE-LABEL: test_3xdouble_add_mul_rhs:
+; GFX9-UNSAFE: ; %bb.0: ; %.entry
+; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX9-UNSAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
; GFX10-UNSAFE-LABEL: test_3xdouble_add_mul_rhs:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -983,3 +1572,57 @@ define <3 x double> @test_3xdouble_add_mul_rhs(<3 x double> %x, <3 x double> %y,
%b = fadd <3 x double> %z, %a
ret <3 x double> %b
}
+
+define <3 x double> @test_3xdouble_add_mul_rhs_contract(<3 x double> %x, <3 x double> %y, <3 x double> %z) {
+; GFX9-LABEL: test_3xdouble_add_mul_rhs_contract:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX9-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX9-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-CONTRACT-LABEL: test_3xdouble_add_mul_rhs_contract:
+; GFX9-CONTRACT: ; %bb.0: ; %.entry
+; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX9-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-DENORM-LABEL: test_3xdouble_add_mul_rhs_contract:
+; GFX9-DENORM: ; %bb.0: ; %.entry
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX9-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX9-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_3xdouble_add_mul_rhs_contract:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-CONTRACT-LABEL: test_3xdouble_add_mul_rhs_contract:
+; GFX10-CONTRACT: ; %bb.0: ; %.entry
+; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX10-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-DENORM-LABEL: test_3xdouble_add_mul_rhs_contract:
+; GFX10-DENORM: ; %bb.0: ; %.entry
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX10-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX10-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
+.entry:
+ %a = fmul contract <3 x double> %x, %y
+ %b = fadd contract <3 x double> %z, %a
+ ret <3 x double> %b
+}
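
Note: the new *_contract tests key FMA formation to the `contract` fast-math flag on the individual fmul/fadd pair rather than to a global unsafe-fp-math mode, so every run configuration now folds the pair to v_fma_f64 / v_pk_fma_f16. A minimal sketch of the IR shape under test (function name hypothetical):

define double @mul_add_contract(double %x, double %y, double %z) {
.entry:
  %a = fmul contract double %x, %y   ; both operations carry contract,
  %b = fadd contract double %a, %z   ; so the pair may fuse into one FMA
  ret double %b
}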
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
index 2845a63..d9ac9a7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
@@ -24,8 +24,8 @@ body: |
%ptr:_(p1) = COPY $vgpr2_vgpr3
%vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
%el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
- %6:_(s32) = G_FMUL %0, %1
- %7:_(s32) = G_FADD %6, %el1
+ %6:_(s32) = contract G_FMUL %0, %1
+ %7:_(s32) = contract G_FADD %6, %el1
$vgpr0 = COPY %7(s32)
...
@@ -54,8 +54,8 @@ body: |
%ptr:_(p1) = COPY $vgpr2_vgpr3
%vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
%el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
- %6:_(s32) = G_FMUL %0, %1
- %7:_(s32) = G_FADD %el1, %6
+ %6:_(s32) = contract G_FMUL %0, %1
+ %7:_(s32) = contract G_FADD %el1, %6
$vgpr0 = COPY %7(s32)
...
@@ -233,10 +233,10 @@ body: |
%7:_(s16) = G_TRUNC %6(s32)
%8:_(s32) = COPY $vgpr5
%9:_(s16) = G_TRUNC %8(s32)
- %10:_(s16) = G_FMUL %7, %9
+ %10:_(s16) = contract G_FMUL %7, %9
%11:_(s32) = G_FPEXT %10(s16)
%12:_(s32) = G_FMA %0, %1, %11
- %13:_(s32) = G_FADD %12, %el1
+ %13:_(s32) = contract G_FADD %12, %el1
$vgpr0 = COPY %13(s32)
...
@@ -282,11 +282,11 @@ body: |
%9:_(s16) = G_TRUNC %8(s32)
%10:_(s32) = COPY $vgpr5
%11:_(s16) = G_TRUNC %10(s32)
- %12:_(s16) = G_FMUL %9, %11
- %13:_(s16) = G_FMUL %1, %3
- %14:_(s16) = G_FADD %13, %12
+ %12:_(s16) = contract G_FMUL %9, %11
+ %13:_(s16) = contract G_FMUL %1, %3
+ %14:_(s16) = contract G_FADD %13, %12
%15:_(s32) = G_FPEXT %14(s16)
- %16:_(s32) = G_FADD %15, %el1
+ %16:_(s32) = contract G_FADD %15, %el1
$vgpr0 = COPY %16(s32)
...
@@ -326,10 +326,10 @@ body: |
%7:_(s16) = G_TRUNC %6(s32)
%8:_(s32) = COPY $vgpr5
%9:_(s16) = G_TRUNC %8(s32)
- %10:_(s16) = G_FMUL %7, %9
+ %10:_(s16) = contract G_FMUL %7, %9
%11:_(s32) = G_FPEXT %10(s16)
%12:_(s32) = G_FMA %4, %5, %11
- %13:_(s32) = G_FADD %el1, %12
+ %13:_(s32) = contract G_FADD %el1, %12
$vgpr0 = COPY %13(s32)
...
@@ -375,11 +375,11 @@ body: |
%9:_(s16) = G_TRUNC %8(s32)
%10:_(s32) = COPY $vgpr5
%11:_(s16) = G_TRUNC %10(s32)
- %12:_(s16) = G_FMUL %9, %11
- %13:_(s16) = G_FMUL %5, %7
- %14:_(s16) = G_FADD %13, %12
+ %12:_(s16) = contract G_FMUL %9, %11
+ %13:_(s16) = contract G_FMUL %5, %7
+ %14:_(s16) = contract G_FADD %13, %12
%15:_(s32) = G_FPEXT %14(s16)
- %16:_(s32) = G_FADD %el1, %15
+ %16:_(s32) = contract G_FADD %el1, %15
$vgpr0 = COPY %16(s32)
...
@@ -409,8 +409,8 @@ body: |
%ptr:_(p1) = COPY $vgpr0_vgpr1
%vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
%el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
- %6:_(s32) = G_FMUL %0, %1
- %7:_(s32) = G_FSUB %6, %el1
+ %6:_(s32) = contract G_FMUL %0, %1
+ %7:_(s32) = contract G_FSUB %6, %el1
$vgpr0 = COPY %7(s32)
...
@@ -440,7 +440,7 @@ body: |
%ptr:_(p1) = COPY $vgpr2_vgpr3
%vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
%el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
- %6:_(s32) = G_FMUL %0, %1
- %7:_(s32) = G_FSUB %el1, %6
+ %6:_(s32) = contract G_FMUL %0, %1
+ %7:_(s32) = contract G_FSUB %el1, %6
$vgpr0 = COPY %7(s32)
...
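
Note: the MIR combine tests apply the same rule before instruction selection: the G_FMUL/G_FADD (and G_FSUB) pairs now carry the contract flag, so the unmerge-values FMA combine stays legal without unsafe-math. A sketch of IR that lowers to this pattern, assuming the IRTranslator carries the flags through (names hypothetical):

define float @mul_add_unmerge(float %x, float %y, ptr addrspace(1) %p) {
  %vec = load <2 x float>, ptr addrspace(1) %p
  %el1 = extractelement <2 x float> %vec, i32 1
  %m = fmul contract float %x, %y
  %a = fadd contract float %m, %el1
  ret float %a
}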
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
index c4d57ac..da25ac0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
@@ -12,7 +12,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da
; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX942-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
+ ; GFX942-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, !noalias.addrspace !0)
; GFX942-NEXT: S_ENDPGM 0
;
; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic
@@ -23,7 +23,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
+ ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, !noalias.addrspace !0)
; GFX11-NEXT: S_ENDPGM 0
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data)
ret void
@@ -38,7 +38,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data
; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX942-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
+ ; GFX942-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, !noalias.addrspace !0)
; GFX942-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
; GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
@@ -50,7 +50,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr)
+ ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, !noalias.addrspace !0)
; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll
index c82ae2fb..bf36979 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll
@@ -13,7 +13,7 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %d
; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: FLAT_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr)
+ ; GFX90A_GFX942-NEXT: FLAT_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, !noalias.addrspace !0)
; GFX90A_GFX942-NEXT: S_ENDPGM 0
%ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
ret void
@@ -30,7 +30,7 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da
; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr)
+ ; GFX90A_GFX942-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, !noalias.addrspace !0)
; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0
; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
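
Note: the two flat-atomic diffs change only the printed MachineMemOperand: the !noalias.addrspace metadata from the source atomicrmw is now kept on the memory operand, recording which address spaces the flat pointer cannot target. A sketch of the input form, assuming !0 excludes the private address space (5):

define void @flat_fadd_f64(ptr %ptr, double %data) {
  %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !0, !amdgpu.no.fine.grained.memory !1
  ret void
}
!0 = !{i32 5, i32 6}
!1 = !{}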
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 8192d4a..0e132f1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -70,12 +70,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
@@ -85,12 +85,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -153,12 +153,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
@@ -168,12 +168,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -236,12 +236,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
@@ -251,12 +251,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -319,12 +319,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
@@ -334,12 +334,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -402,12 +402,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
@@ -417,12 +417,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -485,12 +485,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
@@ -500,12 +500,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -568,12 +568,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
@@ -583,12 +583,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -651,12 +651,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
@@ -666,12 +666,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -734,12 +734,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
@@ -749,12 +749,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -817,12 +817,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
@@ -832,12 +832,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -900,12 +900,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
@@ -915,12 +915,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -983,12 +983,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
@@ -998,12 +998,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir
index f513de8..477ef32 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir
@@ -385,117 +385,16 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
- ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64)
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2047
- ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1008
- ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND]], [[C2]]
- ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C3]](s32)
- ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4094
- ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C4]]
- ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 511
- ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV3]], [[C5]]
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[UV2]]
- ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR]](s32), [[C6]]
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1)
- ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[ZEXT]]
- ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 512
- ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR1]](s32), [[C6]]
- ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[C7]], [[C6]]
- ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 31744
- ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SELECT]], [[C8]]
- ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
- ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ADD]], [[C9]](s32)
- ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL]]
- ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C10]], [[ADD]]
- ; CHECK-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[C6]]
- ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 13
- ; CHECK-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SMAX]], [[C11]]
- ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 4096
- ; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[C12]]
- ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[OR4]], [[SMIN]](s32)
- ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LSHR2]], [[SMIN]](s32)
- ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SHL1]](s32), [[OR4]]
- ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP2]](s1)
- ; CHECK-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[LSHR2]], [[ZEXT1]]
- ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[ADD]](s32), [[C10]]
- ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[OR5]], [[OR3]]
- ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
- ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SELECT1]], [[C13]]
- ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
- ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[SELECT1]], [[C14]](s32)
- ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
- ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND3]](s32), [[C15]]
- ; CHECK-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP4]](s1)
- ; CHECK-NEXT: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
- ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND3]](s32), [[C16]]
- ; CHECK-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP5]](s1)
- ; CHECK-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[ZEXT3]]
- ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR3]], [[OR6]]
- ; CHECK-NEXT: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 30
- ; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[ADD]](s32), [[C17]]
- ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP6]](s1), [[C8]], [[ADD1]]
- ; CHECK-NEXT: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 1039
- ; CHECK-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C18]]
- ; CHECK-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP7]](s1), [[OR2]], [[SELECT2]]
- ; CHECK-NEXT: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C19]](s32)
- ; CHECK-NEXT: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 32768
- ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR4]], [[C20]]
- ; CHECK-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SELECT3]]
- ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
- ; CHECK-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C]](s32)
- ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[LSHR5]], [[C1]]
- ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[AND5]], [[C2]]
- ; CHECK-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C3]](s32)
- ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[LSHR6]], [[C4]]
- ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C5]]
- ; CHECK-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[UV4]]
- ; CHECK-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR8]](s32), [[C6]]
- ; CHECK-NEXT: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP8]](s1)
- ; CHECK-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[ZEXT4]]
- ; CHECK-NEXT: [[ICMP9:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR9]](s32), [[C6]]
- ; CHECK-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP9]](s1), [[C7]], [[C6]]
- ; CHECK-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SELECT4]], [[C8]]
- ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ADD2]], [[C9]](s32)
- ; CHECK-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL2]]
- ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C10]], [[ADD2]]
- ; CHECK-NEXT: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[C6]]
- ; CHECK-NEXT: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[C11]]
- ; CHECK-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[C12]]
- ; CHECK-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[OR12]], [[SMIN1]](s32)
- ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LSHR7]], [[SMIN1]](s32)
- ; CHECK-NEXT: [[ICMP10:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SHL3]](s32), [[OR12]]
- ; CHECK-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP10]](s1)
- ; CHECK-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[LSHR7]], [[ZEXT5]]
- ; CHECK-NEXT: [[ICMP11:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[ADD2]](s32), [[C10]]
- ; CHECK-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP11]](s1), [[OR13]], [[OR11]]
- ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[SELECT5]], [[C13]]
- ; CHECK-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[SELECT5]], [[C14]](s32)
- ; CHECK-NEXT: [[ICMP12:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND8]](s32), [[C15]]
- ; CHECK-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP12]](s1)
- ; CHECK-NEXT: [[ICMP13:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND8]](s32), [[C16]]
- ; CHECK-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP13]](s1)
- ; CHECK-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[ZEXT7]]
- ; CHECK-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[LSHR8]], [[OR14]]
- ; CHECK-NEXT: [[ICMP14:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[ADD2]](s32), [[C17]]
- ; CHECK-NEXT: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[ICMP14]](s1), [[C8]], [[ADD3]]
- ; CHECK-NEXT: [[ICMP15:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ADD2]](s32), [[C18]]
- ; CHECK-NEXT: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[ICMP15]](s1), [[OR10]], [[SELECT6]]
- ; CHECK-NEXT: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C19]](s32)
- ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[LSHR9]], [[C20]]
- ; CHECK-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[AND9]], [[SELECT7]]
- ; CHECK-NEXT: [[C21:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; CHECK-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[OR7]], [[C21]]
- ; CHECK-NEXT: [[AND11:%[0-9]+]]:_(s32) = G_AND [[OR15]], [[C21]]
- ; CHECK-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C19]](s32)
- ; CHECK-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL4]]
- ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR16]](s32)
+ ; CHECK-NEXT: [[FPTRUNC:%[0-9]+]]:_(s32) = afn G_FPTRUNC [[UV]](s64)
+ ; CHECK-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = afn G_FPTRUNC [[FPTRUNC]](s32)
+ ; CHECK-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s32) = afn G_FPTRUNC [[UV1]](s64)
+ ; CHECK-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = afn G_FPTRUNC [[FPTRUNC2]](s32)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16)
+ ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC3]](s16)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]]
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; CHECK-NEXT: $vgpr0 = COPY [[BITCAST]](<2 x s16>)
%0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
%1:_(<2 x s16>) = afn G_FPTRUNC %0
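
Note: the legalize-fptrunc change replaces the long bit-exact f64-to-f16 software lowering with a two-step round through f32 whenever the operation carries the afn (approximate functions) flag, which permits the extra intermediate rounding. A sketch of the corresponding IR, assuming fast-math flags are accepted on fptrunc as the afn G_FPTRUNC above implies:

define <2 x half> @trunc_v2f64_afn(<2 x double> %x) {
  %t = fptrunc afn <2 x double> %x to <2 x half>
  ret <2 x half> %t
}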
diff --git a/llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir b/llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir
index 1a457c9..9241a23 100644
--- a/llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir
+++ b/llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir
@@ -38,20 +38,20 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: early-clobber renamable $sgpr6_sgpr7 = S_LOAD_DWORDX2_IMM_ec renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
; CHECK-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: ("amdgpu-noclobber" load (s128), addrspace 1)
- ; CHECK-NEXT: renamable $vgpr4 = V_MOV_B32_e32 1065353216, implicit $exec
- ; CHECK-NEXT: renamable $vgpr5 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: renamable $vgpr6 = V_MOV_B32_e32 1073741824, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1065353216, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr4 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr1 = V_MOV_B32_e32 1073741824, implicit $exec
; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr4, $vgpr6, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: renamable $vgpr1 = COPY renamable $agpr1
- ; CHECK-NEXT: renamable $vgpr0 = COPY renamable $agpr0
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr4, $vgpr6, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: renamable $vgpr3 = COPY renamable $agpr1
- ; CHECK-NEXT: renamable $vgpr2 = COPY killed renamable $agpr0
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr4, killed $vgpr6, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr0, $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr6 = COPY renamable $agpr1
+ ; CHECK-NEXT: renamable $vgpr5 = COPY renamable $agpr0
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr0, $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr8 = COPY renamable $agpr1
+ ; CHECK-NEXT: renamable $vgpr7 = COPY killed renamable $agpr0
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr5_vgpr6_vgpr7_vgpr8
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr5, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr6_sgpr7, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr4, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr6_sgpr7, 0, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr6_sgpr7 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: ("amdgpu-noclobber" load (s128), addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll
index 1adf542..9979e83 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-math.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll
@@ -323,6 +323,146 @@ define amdgpu_ps void @v_test_max_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat>
ret void
}
+define amdgpu_ps bfloat @test_clamp_bf16(bfloat %src) {
+; GCN-LABEL: test_clamp_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GCN-NEXT: ; return to shader part epilog
+ %max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0)
+ %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
+ ret bfloat %clamp
+}
+
+define amdgpu_ps bfloat @test_clamp_bf16_s(bfloat inreg %src) {
+; GCN-LABEL: test_clamp_bf16_s:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp
+; GCN-NEXT: ; return to shader part epilog
+ %max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0)
+ %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
+ ret bfloat %clamp
+}
+
+define amdgpu_ps float @test_clamp_v2bf16(<2 x bfloat> %src) {
+; GCN-LABEL: test_clamp_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GCN-NEXT: ; return to shader part epilog
+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %src, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
+ %clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
+ %ret = bitcast <2 x bfloat> %clamp to float
+ ret float %ret
+}
+
+define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) {
+; GCN-LABEL: test_clamp_v2bf16_s:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_max_num_bf16 v0, s0, s0 clamp
+; GCN-NEXT: ; return to shader part epilog
+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %src, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
+ %clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
+ %ret = bitcast <2 x bfloat> %clamp to float
+ ret float %ret
+}
+
+define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) {
+; GCN-LABEL: test_clamp_bf16_folding:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_exp_bf16_e32 v0, v0
+; GCN-NEXT: v_nop
+; GCN-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GCN-NEXT: ; return to shader part epilog
+ %exp = call bfloat @llvm.exp2.bf16(bfloat %src)
+ %max = call bfloat @llvm.maxnum.bf16(bfloat %exp, bfloat 0.0)
+ %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
+ ret bfloat %clamp
+}
+
+define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloat> %src1) {
+; GCN-LABEL: test_clamp_v2bf16_folding:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GCN-NEXT: ; return to shader part epilog
+ %mul = fmul <2 x bfloat> %src0, %src1
+ %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %mul, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
+ %clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>)
+ %ret = bitcast <2 x bfloat> %clamp to float
+ ret float %ret
+}
+
+define amdgpu_ps void @v_test_mul_add_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
+; GCN-LABEL: v_test_mul_add_v2bf16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, v2, v3
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_pk_add_bf16 v2, v2, v4
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul contract <2 x bfloat> %a, %b
+ %add = fadd contract <2 x bfloat> %mul, %c
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_mul_add_v2bf16_vss(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
+; GCN-LABEL: v_test_mul_add_v2bf16_vss:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_pk_add_bf16 v2, v2, s1
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul contract <2 x bfloat> %a, %b
+ %add = fadd contract <2 x bfloat> %mul, %c
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_mul_add_v2bf16_sss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
+; GCN-LABEL: v_test_mul_add_v2bf16_sss:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, s0, s1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_pk_add_bf16 v2, v2, s2
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul contract <2 x bfloat> %a, %b
+ %add = fadd contract <2 x bfloat> %mul, %c
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_mul_add_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
+; GCN-LABEL: v_test_mul_add_v2bf16_vsc:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_pk_add_bf16 v2, v2, 0.5 op_sel_hi:[1,0]
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul contract <2 x bfloat> %a, %b
+ %add = fadd contract <2 x bfloat> %mul, <bfloat 0.5, bfloat 0.5>
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @v_test_mul_add_v2bf16_vll(ptr addrspace(1) %out, <2 x bfloat> %a) {
+; GCN-LABEL: v_test_mul_add_v2bf16_vll:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_mul_bf16 v2, 0x42c83f80, v2
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_pk_add_bf16 v2, 0x43484000, v2
+; GCN-NEXT: global_store_b32 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %mul = fmul contract <2 x bfloat> %a, <bfloat 1.0, bfloat 100.0>
+ %add = fadd contract <2 x bfloat> %mul, <bfloat 2.0, bfloat 200.0>
+ store <2 x bfloat> %add, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_ps void @v_test_fma_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
; GCN-LABEL: v_test_fma_v2bf16_vvv:
; GCN: ; %bb.0:
@@ -426,6 +566,8 @@ define amdgpu_ps void @llvm_exp2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src
ret void
}
+declare bfloat @llvm.minnum.bf16(bfloat, bfloat)
+declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)
declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 7859fcdf..52e697c 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -468,15 +468,28 @@ define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_load_global_v16bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v9, v1
-; GFX9-NEXT: v_mov_b32_e32 v8, v0
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[8:9], off
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_load_global_v16bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v9, v1
+; GFX900-NEXT: v_mov_b32_e32 v8, v0
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v[8:9], off
+; GFX900-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_load_global_v16bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: global_load_dwordx4 v[8:11], v[0:1], off
+; GFX950-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX950-NEXT: s_waitcnt vmcnt(1)
+; GFX950-NEXT: v_mov_b32_e32 v0, v8
+; GFX950-NEXT: v_mov_b32_e32 v1, v9
+; GFX950-NEXT: v_mov_b32_e32 v2, v10
+; GFX950-NEXT: v_mov_b32_e32 v3, v11
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
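+; GFX950 keeps the pointer live in v[0:1] for both loads and moves the
+; first returned quad into v[0:3] only after that load retires (vmcnt(1)),
+; instead of copying the pointer aside up front as GFX900 does.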
;
; GFX10-LABEL: v_load_global_v16bf16:
; GFX10: ; %bb.0:
@@ -619,17 +632,32 @@ define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_load_global_v32bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v17, v1
-; GFX9-NEXT: v_mov_b32_e32 v16, v0
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[16:17], off
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16
-; GFX9-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32
-; GFX9-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_load_global_v32bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v17, v1
+; GFX900-NEXT: v_mov_b32_e32 v16, v0
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v[16:17], off
+; GFX900-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16
+; GFX900-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32
+; GFX900-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_load_global_v32bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: global_load_dwordx4 v[16:19], v[0:1], off
+; GFX950-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX950-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32
+; GFX950-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48
+; GFX950-NEXT: s_waitcnt vmcnt(3)
+; GFX950-NEXT: v_mov_b32_e32 v0, v16
+; GFX950-NEXT: v_mov_b32_e32 v1, v17
+; GFX950-NEXT: v_mov_b32_e32 v2, v18
+; GFX950-NEXT: v_mov_b32_e32 v3, v19
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_load_global_v32bf16:
; GFX10: ; %bb.0:
@@ -877,22 +905,41 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_load_global_v64bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v29, v1
-; GFX9-NEXT: v_mov_b32_e32 v28, v0
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[28:29], off
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16
-; GFX9-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32
-; GFX9-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48
-; GFX9-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64
-; GFX9-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80
-; GFX9-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_load_global_v64bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v29, v1
+; GFX900-NEXT: v_mov_b32_e32 v28, v0
+; GFX900-NEXT: global_load_dwordx4 v[0:3], v[28:29], off
+; GFX900-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16
+; GFX900-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32
+; GFX900-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48
+; GFX900-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64
+; GFX900-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80
+; GFX900-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96
+; GFX900-NEXT: s_nop 0
+; GFX900-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_load_global_v64bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: global_load_dwordx4 v[32:35], v[0:1], off
+; GFX950-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX950-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32
+; GFX950-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48
+; GFX950-NEXT: global_load_dwordx4 v[16:19], v[0:1], off offset:64
+; GFX950-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:80
+; GFX950-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:96
+; GFX950-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:112
+; GFX950-NEXT: s_waitcnt vmcnt(7)
+; GFX950-NEXT: v_mov_b32_e32 v0, v32
+; GFX950-NEXT: v_mov_b32_e32 v1, v33
+; GFX950-NEXT: v_mov_b32_e32 v2, v34
+; GFX950-NEXT: v_mov_b32_e32 v3, v35
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_load_global_v64bf16:
; GFX10: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 7208eae..763f436 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -51,11 +51,11 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) {
; GFX942-LABEL: build_vector2:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, 5
-; GFX942-NEXT: v_mov_b32_e32 v1, 6
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 5
+; GFX942-NEXT: v_mov_b32_e32 v3, 6
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
store <2 x i32> <i32 5, i32 6>, ptr addrspace(1) %out
@@ -116,13 +116,13 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) {
; GFX942-LABEL: build_vector4:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, 5
-; GFX942-NEXT: v_mov_b32_e32 v1, 6
-; GFX942-NEXT: v_mov_b32_e32 v2, 7
-; GFX942-NEXT: v_mov_b32_e32 v3, 8
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 5
+; GFX942-NEXT: v_mov_b32_e32 v3, 6
+; GFX942-NEXT: v_mov_b32_e32 v4, 7
+; GFX942-NEXT: v_mov_b32_e32 v5, 8
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, ptr addrspace(1) %out
@@ -307,13 +307,13 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
; GFX942-LABEL: build_v2i32_from_v4i16_shuffle:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_lshl_b32 s3, s3, 16
; GFX942-NEXT: s_lshl_b32 s2, s2, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, s2
-; GFX942-NEXT: v_mov_b32_e32 v1, s3
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, s2
+; GFX942-NEXT: v_mov_b32_e32 v3, s3
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
%shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> <i32 0, i32 2>
diff --git a/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll b/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll
index 8781196..4f752d1 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll
@@ -8,10 +8,10 @@ define amdgpu_ps void @global_load_lds_dword_saddr(ptr addrspace(1) inreg nocapt
; GFX942-LABEL: global_load_lds_dword_saddr:
; GFX942: ; %bb.0: ; %main_body
; GFX942-NEXT: v_readfirstlane_b32 s2, v0
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_mov_b32 m0, s2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: global_load_lds_dword v2, s[0:1] offset:32 nt
+; GFX942-NEXT: global_load_lds_dword v1, s[0:1] offset:32 nt
; GFX942-NEXT: s_getpc_b64 s[0:1]
; GFX942-NEXT: s_add_u32 s0, s0, G@gotpcrel32@lo+4
; GFX942-NEXT: s_addc_u32 s1, s1, G@gotpcrel32@hi+12
@@ -21,9 +21,9 @@ define amdgpu_ps void @global_load_lds_dword_saddr(ptr addrspace(1) inreg nocapt
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_mul_i32 s3, s3, 10
; GFX942-NEXT: s_mul_i32 s2, s2, 10
-; GFX942-NEXT: v_mov_b32_e32 v0, s2
-; GFX942-NEXT: v_mov_b32_e32 v1, s3
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, s2
+; GFX942-NEXT: v_mov_b32_e32 v3, s3
+; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX90A-LABEL: global_load_lds_dword_saddr:
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index 873fcee..6067194 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -71,12 +71,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
@@ -86,12 +86,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -154,12 +154,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
@@ -169,12 +169,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -237,12 +237,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
@@ -252,12 +252,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -320,12 +320,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
@@ -335,12 +335,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -403,12 +403,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
@@ -418,12 +418,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -486,12 +486,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
@@ -501,12 +501,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -569,12 +569,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
@@ -584,12 +584,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -652,12 +652,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
@@ -667,12 +667,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -735,12 +735,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
@@ -750,12 +750,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
@@ -818,12 +818,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
@@ -833,12 +833,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
@@ -901,12 +901,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
@@ -916,12 +916,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
@@ -984,12 +984,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s10
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
@@ -999,12 +999,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index d0b41e1..57b4857 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -global-isel=0 -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI-SDAG %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -global-isel=1 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI-GISEL %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=VI-SDAG %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=1 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=VI-GISEL %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SDAG %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-GISEL %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-SDAG %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-GISEL %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG-FAKE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -global-isel=0 < %s | FileCheck -enable-var-scope -check-prefixes=SI-SDAG %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -global-isel=1 < %s | FileCheck -check-prefixes=SI-GISEL %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=VI-SDAG %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=VI-GISEL %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SDAG %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-GISEL %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-SDAG %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-GISEL %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-FAKE16 %s
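+; -enable-unsafe-fp-math is dropped from every RUN line; the truncations
+; that relied on it now carry the afn fast-math flag on the instruction
+; itself (see the *_afn variants below).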
define amdgpu_kernel void @fptrunc_f32_to_f16(
; SI-SDAG-LABEL: fptrunc_f32_to_f16:
@@ -201,8 +201,8 @@ entry:
ret void
}
-define amdgpu_kernel void @fptrunc_f64_to_f16(
-; SI-SDAG-LABEL: fptrunc_f64_to_f16:
+define amdgpu_kernel void @fptrunc_f32_to_f16_afn(ptr addrspace(1) %r,
+; SI-SDAG-LABEL: fptrunc_f32_to_f16_afn:
; SI-SDAG: ; %bb.0: ; %entry
; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
@@ -212,29 +212,27 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SI-SDAG-NEXT: s_mov_b32 s8, s2
; SI-SDAG-NEXT: s_mov_b32 s9, s3
-; SI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-SDAG-NEXT: s_mov_b32 s4, s0
; SI-SDAG-NEXT: s_mov_b32 s5, s1
; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-SDAG-NEXT: s_endpgm
;
-; SI-GISEL-LABEL: fptrunc_f64_to_f16:
+; SI-GISEL-LABEL: fptrunc_f32_to_f16_afn:
; SI-GISEL: ; %bb.0: ; %entry
; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0
; SI-GISEL-NEXT: s_mov_b32 s2, -1
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s3
; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000
; SI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-GISEL-NEXT: s_endpgm
;
-; VI-SDAG-LABEL: fptrunc_f64_to_f16:
+; VI-SDAG-LABEL: fptrunc_f32_to_f16_afn:
; VI-SDAG: ; %bb.0: ; %entry
; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
@@ -244,29 +242,27 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: s_mov_b32 s8, s2
; VI-SDAG-NEXT: s_mov_b32 s9, s3
-; VI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-SDAG-NEXT: s_mov_b32 s4, s0
; VI-SDAG-NEXT: s_mov_b32 s5, s1
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-SDAG-NEXT: s_endpgm
;
-; VI-GISEL-LABEL: fptrunc_f64_to_f16:
+; VI-GISEL-LABEL: fptrunc_f32_to_f16_afn:
; VI-GISEL: ; %bb.0: ; %entry
; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
-; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-SDAG-LABEL: fptrunc_f64_to_f16:
+; GFX9-SDAG-LABEL: fptrunc_f32_to_f16_afn:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
@@ -276,29 +272,27 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX9-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX9-GISEL-LABEL: fptrunc_f64_to_f16:
+; GFX9-GISEL-LABEL: fptrunc_f32_to_f16_afn:
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: fptrunc_f64_to_f16:
+; GFX950-SDAG-LABEL: fptrunc_f32_to_f16_afn:
; GFX950-SDAG: ; %bb.0: ; %entry
; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX950-SDAG-NEXT: s_mov_b32 s7, 0xf000
@@ -308,23 +302,541 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_mov_b32 s8, s2
; GFX950-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX950-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GFX950-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GFX950-SDAG-NEXT: s_mov_b32 s4, s0
; GFX950-SDAG-NEXT: s_mov_b32 s5, s1
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX950-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX950-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX950-SDAG-NEXT: s_endpgm
;
+; GFX950-GISEL-LABEL: fptrunc_f32_to_f16_afn:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GFX950-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-TRUE16-LABEL: fptrunc_f32_to_f16_afn:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-SDAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: fptrunc_f32_to_f16_afn:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-SDAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-FAKE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-TRUE16-LABEL: fptrunc_f32_to_f16_afn:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s2
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-FAKE16-LABEL: fptrunc_f32_to_f16_afn:
+; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-FAKE16-NEXT: s_endpgm
+ ptr addrspace(1) %a) {
+entry:
+ %a.val = load float, ptr addrspace(1) %a
+ %r.val = fptrunc afn float %a.val to half
+ store half %r.val, ptr addrspace(1) %r
+ ret void
+}
+
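+; Without unsafe-fp-math, f64 -> f16 may no longer be lowered as two steps
+; (f64 -> f32 -> f16 double-rounds); the checks below show the correctly
+; rounded integer expansion, with 0x7c00/0x7e00 supplying the Inf/NaN
+; encodings and 0x8000 reattaching the sign.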
+define amdgpu_kernel void @fptrunc_f64_to_f16(
+; SI-SDAG-LABEL: fptrunc_f64_to_f16:
+; SI-SDAG: ; %bb.0: ; %entry
+; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: s_mov_b32 s10, s2
+; SI-SDAG-NEXT: s_mov_b32 s11, s3
+; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT: s_mov_b32 s8, s6
+; SI-SDAG-NEXT: s_mov_b32 s9, s7
+; SI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SI-SDAG-NEXT: s_movk_i32 s0, 0x7e00
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; SI-SDAG-NEXT: s_and_b32 s6, s1, 0x1ff
+; SI-SDAG-NEXT: s_lshr_b32 s7, s1, 8
+; SI-SDAG-NEXT: s_bfe_u32 s8, s1, 0xb0014
+; SI-SDAG-NEXT: v_or_b32_e32 v0, s6, v0
+; SI-SDAG-NEXT: s_and_b32 s6, s7, 0xffe
+; SI-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8
+; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13
+; SI-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; SI-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; SI-SDAG-NEXT: s_or_b32 s6, s6, s7
+; SI-SDAG-NEXT: s_or_b32 s7, s6, 0x1000
+; SI-SDAG-NEXT: s_lshr_b32 s10, s7, s9
+; SI-SDAG-NEXT: s_lshl_b32 s9, s10, s9
+; SI-SDAG-NEXT: s_cmp_lg_u32 s9, s7
+; SI-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; SI-SDAG-NEXT: s_addk_i32 s8, 0xfc10
+; SI-SDAG-NEXT: s_or_b32 s7, s10, s7
+; SI-SDAG-NEXT: s_lshl_b32 s9, s8, 12
+; SI-SDAG-NEXT: s_or_b32 s9, s6, s9
+; SI-SDAG-NEXT: s_cmp_lt_i32 s8, 1
+; SI-SDAG-NEXT: s_cselect_b32 s7, s7, s9
+; SI-SDAG-NEXT: s_and_b32 s9, s7, 7
+; SI-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; SI-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; SI-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; SI-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; SI-SDAG-NEXT: s_lshr_b32 s7, s7, 2
+; SI-SDAG-NEXT: s_or_b32 s9, s9, s10
+; SI-SDAG-NEXT: s_add_i32 s7, s7, s9
+; SI-SDAG-NEXT: s_cmp_lt_i32 s8, 31
+; SI-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00
+; SI-SDAG-NEXT: s_cmp_lg_u32 s6, 0
+; SI-SDAG-NEXT: s_cselect_b32 s0, s0, 0x7c00
+; SI-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f
+; SI-SDAG-NEXT: s_cselect_b32 s0, s0, s7
+; SI-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; SI-SDAG-NEXT: s_and_b32 s1, s1, 0x8000
+; SI-SDAG-NEXT: s_or_b32 s6, s1, s0
+; SI-SDAG-NEXT: s_mov_b32 s0, s4
+; SI-SDAG-NEXT: s_mov_b32 s1, s5
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; SI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-SDAG-NEXT: s_endpgm
+;
+; SI-GISEL-LABEL: fptrunc_f64_to_f16:
+; SI-GISEL: ; %bb.0: ; %entry
+; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SI-GISEL-NEXT: s_mov_b32 s2, -1
+; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT: s_bfe_u32 s3, s5, 0xb0014
+; SI-GISEL-NEXT: s_lshr_b32 s6, s5, 8
+; SI-GISEL-NEXT: s_and_b32 s7, s5, 0x1ff
+; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
+; SI-GISEL-NEXT: s_and_b32 s6, s6, 0xffe
+; SI-GISEL-NEXT: s_or_b32 s4, s7, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s4, s6, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9
+; SI-GISEL-NEXT: s_lshl_b32 s7, s3, 12
+; SI-GISEL-NEXT: s_sub_i32 s8, 1, s3
+; SI-GISEL-NEXT: s_or_b32 s9, s4, 0x1000
+; SI-GISEL-NEXT: s_or_b32 s6, s6, 0x7c00
+; SI-GISEL-NEXT: s_or_b32 s4, s4, s7
+; SI-GISEL-NEXT: s_max_i32 s7, s8, 0
+; SI-GISEL-NEXT: s_min_i32 s7, s7, 13
+; SI-GISEL-NEXT: s_lshr_b32 s8, s9, s7
+; SI-GISEL-NEXT: s_lshl_b32 s7, s8, s7
+; SI-GISEL-NEXT: s_cmp_lg_u32 s7, s9
+; SI-GISEL-NEXT: s_cselect_b32 s7, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s7, s8, s7
+; SI-GISEL-NEXT: s_cmp_lt_i32 s3, 1
+; SI-GISEL-NEXT: s_cselect_b32 s4, s7, s4
+; SI-GISEL-NEXT: s_and_b32 s7, s4, 7
+; SI-GISEL-NEXT: s_lshr_b32 s4, s4, 2
+; SI-GISEL-NEXT: s_cmp_eq_u32 s7, 3
+; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; SI-GISEL-NEXT: s_cmp_gt_i32 s7, 5
+; SI-GISEL-NEXT: s_cselect_b32 s7, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s7, s8, s7
+; SI-GISEL-NEXT: s_add_i32 s4, s4, s7
+; SI-GISEL-NEXT: s_cmp_gt_i32 s3, 30
+; SI-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4
+; SI-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; SI-GISEL-NEXT: s_cselect_b32 s3, s6, s4
+; SI-GISEL-NEXT: s_lshr_b32 s4, s5, 16
+; SI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
+; SI-GISEL-NEXT: s_or_b32 s4, s4, s3
+; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; SI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-GISEL-NEXT: s_endpgm
+;
+; VI-SDAG-LABEL: fptrunc_f64_to_f16:
+; VI-SDAG: ; %bb.0: ; %entry
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
+; VI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
+; VI-SDAG-NEXT: s_movk_i32 s6, 0x7e00
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT: v_readfirstlane_b32 s4, v1
+; VI-SDAG-NEXT: s_and_b32 s5, s4, 0x1ff
+; VI-SDAG-NEXT: v_or_b32_e32 v0, s5, v0
+; VI-SDAG-NEXT: s_lshr_b32 s7, s4, 8
+; VI-SDAG-NEXT: s_bfe_u32 s8, s4, 0xb0014
+; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; VI-SDAG-NEXT: s_and_b32 s5, s7, 0xffe
+; VI-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13
+; VI-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; VI-SDAG-NEXT: s_or_b32 s5, s5, s7
+; VI-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; VI-SDAG-NEXT: s_or_b32 s7, s5, 0x1000
+; VI-SDAG-NEXT: s_lshr_b32 s10, s7, s9
+; VI-SDAG-NEXT: s_lshl_b32 s9, s10, s9
+; VI-SDAG-NEXT: s_cmp_lg_u32 s9, s7
+; VI-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; VI-SDAG-NEXT: s_addk_i32 s8, 0xfc10
+; VI-SDAG-NEXT: s_lshl_b32 s9, s8, 12
+; VI-SDAG-NEXT: s_or_b32 s7, s10, s7
+; VI-SDAG-NEXT: s_or_b32 s9, s5, s9
+; VI-SDAG-NEXT: s_cmp_lt_i32 s8, 1
+; VI-SDAG-NEXT: s_cselect_b32 s7, s7, s9
+; VI-SDAG-NEXT: s_and_b32 s9, s7, 7
+; VI-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; VI-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; VI-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; VI-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; VI-SDAG-NEXT: s_lshr_b32 s7, s7, 2
+; VI-SDAG-NEXT: s_or_b32 s9, s9, s10
+; VI-SDAG-NEXT: s_add_i32 s7, s7, s9
+; VI-SDAG-NEXT: s_cmp_lt_i32 s8, 31
+; VI-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00
+; VI-SDAG-NEXT: s_cmp_lg_u32 s5, 0
+; VI-SDAG-NEXT: s_cselect_b32 s5, s6, 0x7c00
+; VI-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f
+; VI-SDAG-NEXT: s_cselect_b32 s5, s5, s7
+; VI-SDAG-NEXT: s_lshr_b32 s4, s4, 16
+; VI-SDAG-NEXT: s_and_b32 s4, s4, 0x8000
+; VI-SDAG-NEXT: s_or_b32 s4, s4, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_f64_to_f16:
+; VI-GISEL: ; %bb.0: ; %entry
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
+; VI-GISEL-NEXT: s_lshr_b32 s5, s3, 8
+; VI-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
+; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10
+; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
+; VI-GISEL-NEXT: s_or_b32 s2, s6, s2
+; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s2, s5, s2
+; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4
+; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12
+; VI-GISEL-NEXT: s_max_i32 s7, s7, 0
+; VI-GISEL-NEXT: s_or_b32 s6, s2, s6
+; VI-GISEL-NEXT: s_min_i32 s7, s7, 13
+; VI-GISEL-NEXT: s_bitset1_b32 s2, 12
+; VI-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; VI-GISEL-NEXT: s_lshr_b32 s8, s2, s7
+; VI-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; VI-GISEL-NEXT: s_lshl_b32 s7, s8, s7
+; VI-GISEL-NEXT: s_cmp_lg_u32 s7, s2
+; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s2, s8, s2
+; VI-GISEL-NEXT: s_cmp_lt_i32 s4, 1
+; VI-GISEL-NEXT: s_cselect_b32 s2, s2, s6
+; VI-GISEL-NEXT: s_and_b32 s6, s2, 7
+; VI-GISEL-NEXT: s_lshr_b32 s2, s2, 2
+; VI-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; VI-GISEL-NEXT: s_cselect_b32 s7, 1, 0
+; VI-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; VI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s6, s7, s6
+; VI-GISEL-NEXT: s_add_i32 s2, s2, s6
+; VI-GISEL-NEXT: s_cmp_gt_i32 s4, 30
+; VI-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; VI-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; VI-GISEL-NEXT: s_cselect_b32 s2, s5, s2
+; VI-GISEL-NEXT: s_lshr_b32 s3, s3, 16
+; VI-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; VI-GISEL-NEXT: s_or_b32 s2, s3, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: s_endpgm
+;
+; GFX9-SDAG-LABEL: fptrunc_f64_to_f16:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s10
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s11
+; GFX9-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s8
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s9
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7e00
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-SDAG-NEXT: s_and_b32 s6, s5, 0x1ff
+; GFX9-SDAG-NEXT: v_or_b32_e32 v0, s6, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s7, s5, 8
+; GFX9-SDAG-NEXT: s_bfe_u32 s8, s5, 0xb0014
+; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-SDAG-NEXT: s_and_b32 s6, s7, 0xffe
+; GFX9-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; GFX9-SDAG-NEXT: s_or_b32 s6, s6, s7
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; GFX9-SDAG-NEXT: s_or_b32 s7, s6, 0x1000
+; GFX9-SDAG-NEXT: s_lshr_b32 s10, s7, s9
+; GFX9-SDAG-NEXT: s_lshl_b32 s9, s10, s9
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s9, s7
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; GFX9-SDAG-NEXT: s_addk_i32 s8, 0xfc10
+; GFX9-SDAG-NEXT: s_lshl_b32 s9, s8, 12
+; GFX9-SDAG-NEXT: s_or_b32 s7, s10, s7
+; GFX9-SDAG-NEXT: s_or_b32 s9, s6, s9
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s8, 1
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, s9
+; GFX9-SDAG-NEXT: s_and_b32 s9, s7, 7
+; GFX9-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; GFX9-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; GFX9-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; GFX9-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; GFX9-SDAG-NEXT: s_lshr_b32 s7, s7, 2
+; GFX9-SDAG-NEXT: s_or_b32 s9, s9, s10
+; GFX9-SDAG-NEXT: s_add_i32 s7, s7, s9
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s8, 31
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s6, 0
+; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f
+; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, s7
+; GFX9-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; GFX9-SDAG-NEXT: s_or_b32 s4, s5, s4
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: fptrunc_f64_to_f16:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
+; GFX9-GISEL-NEXT: s_lshr_b32 s5, s3, 8
+; GFX9-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX9-GISEL-NEXT: s_addk_i32 s4, 0xfc10
+; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX9-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s2, s5, s2
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; GFX9-GISEL-NEXT: s_sub_i32 s7, 1, s4
+; GFX9-GISEL-NEXT: s_lshl_b32 s6, s4, 12
+; GFX9-GISEL-NEXT: s_max_i32 s7, s7, 0
+; GFX9-GISEL-NEXT: s_or_b32 s6, s2, s6
+; GFX9-GISEL-NEXT: s_min_i32 s7, s7, 13
+; GFX9-GISEL-NEXT: s_bitset1_b32 s2, 12
+; GFX9-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; GFX9-GISEL-NEXT: s_lshr_b32 s8, s2, s7
+; GFX9-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX9-GISEL-NEXT: s_lshl_b32 s7, s8, s7
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s7, s2
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s2, s8, s2
+; GFX9-GISEL-NEXT: s_cmp_lt_i32 s4, 1
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, s2, s6
+; GFX9-GISEL-NEXT: s_and_b32 s6, s2, 7
+; GFX9-GISEL-NEXT: s_lshr_b32 s2, s2, 2
+; GFX9-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; GFX9-GISEL-NEXT: s_cselect_b32 s7, 1, 0
+; GFX9-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; GFX9-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s6, s7, s6
+; GFX9-GISEL-NEXT: s_add_i32 s2, s2, s6
+; GFX9-GISEL-NEXT: s_cmp_gt_i32 s4, 30
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; GFX9-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, s5, s2
+; GFX9-GISEL-NEXT: s_lshr_b32 s3, s3, 16
+; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: fptrunc_f64_to_f16:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX950-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX950-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_mov_b32 s4, s10
+; GFX950-SDAG-NEXT: s_mov_b32 s5, s11
+; GFX950-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX950-SDAG-NEXT: s_mov_b32 s0, s8
+; GFX950-SDAG-NEXT: s_mov_b32 s1, s9
+; GFX950-SDAG-NEXT: s_movk_i32 s4, 0x7e00
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s5, v1
+; GFX950-SDAG-NEXT: s_and_b32 s6, s5, 0x1ff
+; GFX950-SDAG-NEXT: v_or_b32_e32 v0, s6, v0
+; GFX950-SDAG-NEXT: s_lshr_b32 s7, s5, 8
+; GFX950-SDAG-NEXT: s_bfe_u32 s8, s5, 0xb0014
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX950-SDAG-NEXT: s_and_b32 s6, s7, 0xffe
+; GFX950-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8
+; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; GFX950-SDAG-NEXT: s_or_b32 s6, s6, s7
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; GFX950-SDAG-NEXT: s_or_b32 s7, s6, 0x1000
+; GFX950-SDAG-NEXT: s_lshr_b32 s10, s7, s9
+; GFX950-SDAG-NEXT: s_lshl_b32 s9, s10, s9
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s9, s7
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; GFX950-SDAG-NEXT: s_addk_i32 s8, 0xfc10
+; GFX950-SDAG-NEXT: s_lshl_b32 s9, s8, 12
+; GFX950-SDAG-NEXT: s_or_b32 s7, s10, s7
+; GFX950-SDAG-NEXT: s_or_b32 s9, s6, s9
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s8, 1
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, s9
+; GFX950-SDAG-NEXT: s_and_b32 s9, s7, 7
+; GFX950-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; GFX950-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; GFX950-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; GFX950-SDAG-NEXT: s_lshr_b32 s7, s7, 2
+; GFX950-SDAG-NEXT: s_or_b32 s9, s9, s10
+; GFX950-SDAG-NEXT: s_add_i32 s7, s7, s9
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s8, 31
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s6, 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f
+; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, s7
+; GFX950-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; GFX950-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; GFX950-SDAG-NEXT: s_or_b32 s4, s5, s4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX950-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX950-SDAG-NEXT: s_endpgm
+;
; GFX950-GISEL-LABEL: fptrunc_f64_to_f16:
; GFX950-GISEL: ; %bb.0: ; %entry
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX950-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
+; GFX950-GISEL-NEXT: s_lshr_b32 s5, s3, 8
+; GFX950-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX950-GISEL-NEXT: s_addk_i32 s4, 0xfc10
+; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX950-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s2, s5, s2
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; GFX950-GISEL-NEXT: s_sub_i32 s7, 1, s4
+; GFX950-GISEL-NEXT: s_lshl_b32 s6, s4, 12
+; GFX950-GISEL-NEXT: s_max_i32 s7, s7, 0
+; GFX950-GISEL-NEXT: s_or_b32 s6, s2, s6
+; GFX950-GISEL-NEXT: s_min_i32 s7, s7, 13
+; GFX950-GISEL-NEXT: s_bitset1_b32 s2, 12
+; GFX950-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; GFX950-GISEL-NEXT: s_lshr_b32 s8, s2, s7
+; GFX950-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX950-GISEL-NEXT: s_lshl_b32 s7, s8, s7
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s7, s2
+; GFX950-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s2, s8, s2
+; GFX950-GISEL-NEXT: s_cmp_lt_i32 s4, 1
+; GFX950-GISEL-NEXT: s_cselect_b32 s2, s2, s6
+; GFX950-GISEL-NEXT: s_and_b32 s6, s2, 7
+; GFX950-GISEL-NEXT: s_lshr_b32 s2, s2, 2
+; GFX950-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; GFX950-GISEL-NEXT: s_cselect_b32 s7, 1, 0
+; GFX950-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; GFX950-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s6, s7, s6
+; GFX950-GISEL-NEXT: s_add_i32 s2, s2, s6
+; GFX950-GISEL-NEXT: s_cmp_gt_i32 s4, 30
+; GFX950-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; GFX950-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; GFX950-GISEL-NEXT: s_cselect_b32 s2, s5, s2
+; GFX950-GISEL-NEXT: s_lshr_b32 s3, s3, 16
+; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX950-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX950-GISEL-NEXT: s_mov_b32 s2, -1
; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000
; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -340,13 +852,60 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0
; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s3, s2, 0x1ff
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, s3, v0
+; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s3, s2, 0xb0014
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v1, s4, 0, 13
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s8, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s9, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s9, s8
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s8, s5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s3, 0xfc10
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s9, s5
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s3, 12
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s4, s8
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s8, s5, 7
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 31
+; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s8, 0x7e00
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s4, s8, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s4, s5
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, s2
; GFX11-SDAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-SDAG-TRUE16-NEXT: s_endpgm
;
@@ -360,13 +919,60 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2
; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0
; GFX11-SDAG-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s3, s2, 0x1ff
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, s3, v0
+; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s3, s2, 0xb0014
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v1, s4, 0, 13
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s8, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s9, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s9, s8
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s8, s5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s3, 0xfc10
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s9, s5
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s3, 12
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s4, s8
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s8, s5, 7
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 31
+; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s8, 0x7e00
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s4, s8, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s4, s5
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, s2
; GFX11-SDAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-SDAG-FAKE16-NEXT: s_endpgm
;
@@ -376,6 +982,555 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000
+; GFX11-GISEL-TRUE16-NEXT: s_max_i32 s6, s6, 0
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s7, s4, 12
+; GFX11-GISEL-TRUE16-NEXT: s_min_i32 s6, s6, 13
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s5, s5, 9
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s9, s8, s6
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s2, s7
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s6, s9, s6
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, s8
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s9, s6
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lt_i32 s4, 1
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s6, s2
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s2, 7
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s2, s2, 2
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s7, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-GISEL-TRUE16-NEXT: s_add_i32 s2, s2, s6
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s4, 30
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s5, s2
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000
+; GFX11-GISEL-FAKE16-NEXT: s_max_i32 s6, s6, 0
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s7, s4, 12
+; GFX11-GISEL-FAKE16-NEXT: s_min_i32 s6, s6, 13
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s5, s5, 9
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s9, s8, s6
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s2, s7
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s6, s9, s6
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, s8
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s9, s6
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lt_i32 s4, 1
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s6, s2
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s2, 7
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s2, s2, 2
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s7, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s7, s6
+; GFX11-GISEL-FAKE16-NEXT: s_add_i32 s2, s2, s6
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s4, 30
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s5, s2
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2
+; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-FAKE16-NEXT: s_endpgm
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a) {
+entry:
+ %a.val = load double, ptr addrspace(1) %a
+ %r.val = fptrunc double %a.val to half
+ store half %r.val, ptr addrspace(1) %r
+ ret void
+}
+
+define amdgpu_kernel void @fptrunc_f64_to_f16_afn(
+; SI-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; SI-SDAG: ; %bb.0: ; %entry
+; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: s_mov_b32 s10, s2
+; SI-SDAG-NEXT: s_mov_b32 s11, s3
+; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT: s_mov_b32 s8, s6
+; SI-SDAG-NEXT: s_mov_b32 s9, s7
+; SI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SI-SDAG-NEXT: s_movk_i32 s0, 0x7e00
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; SI-SDAG-NEXT: s_and_b32 s6, s1, 0x1ff
+; SI-SDAG-NEXT: s_lshr_b32 s7, s1, 8
+; SI-SDAG-NEXT: s_bfe_u32 s8, s1, 0xb0014
+; SI-SDAG-NEXT: v_or_b32_e32 v0, s6, v0
+; SI-SDAG-NEXT: s_and_b32 s6, s7, 0xffe
+; SI-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8
+; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13
+; SI-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; SI-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; SI-SDAG-NEXT: s_or_b32 s6, s6, s7
+; SI-SDAG-NEXT: s_or_b32 s7, s6, 0x1000
+; SI-SDAG-NEXT: s_lshr_b32 s10, s7, s9
+; SI-SDAG-NEXT: s_lshl_b32 s9, s10, s9
+; SI-SDAG-NEXT: s_cmp_lg_u32 s9, s7
+; SI-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; SI-SDAG-NEXT: s_addk_i32 s8, 0xfc10
+; SI-SDAG-NEXT: s_or_b32 s7, s10, s7
+; SI-SDAG-NEXT: s_lshl_b32 s9, s8, 12
+; SI-SDAG-NEXT: s_or_b32 s9, s6, s9
+; SI-SDAG-NEXT: s_cmp_lt_i32 s8, 1
+; SI-SDAG-NEXT: s_cselect_b32 s7, s7, s9
+; SI-SDAG-NEXT: s_and_b32 s9, s7, 7
+; SI-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; SI-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; SI-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; SI-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; SI-SDAG-NEXT: s_lshr_b32 s7, s7, 2
+; SI-SDAG-NEXT: s_or_b32 s9, s9, s10
+; SI-SDAG-NEXT: s_add_i32 s7, s7, s9
+; SI-SDAG-NEXT: s_cmp_lt_i32 s8, 31
+; SI-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00
+; SI-SDAG-NEXT: s_cmp_lg_u32 s6, 0
+; SI-SDAG-NEXT: s_cselect_b32 s0, s0, 0x7c00
+; SI-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f
+; SI-SDAG-NEXT: s_cselect_b32 s0, s0, s7
+; SI-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; SI-SDAG-NEXT: s_and_b32 s1, s1, 0x8000
+; SI-SDAG-NEXT: s_or_b32 s6, s1, s0
+; SI-SDAG-NEXT: s_mov_b32 s0, s4
+; SI-SDAG-NEXT: s_mov_b32 s1, s5
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; SI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-SDAG-NEXT: s_endpgm
+;
+; SI-GISEL-LABEL: fptrunc_f64_to_f16_afn:
+; SI-GISEL: ; %bb.0: ; %entry
+; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SI-GISEL-NEXT: s_mov_b32 s2, -1
+; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-GISEL-NEXT: s_endpgm
+;
+; VI-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; VI-SDAG: ; %bb.0: ; %entry
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
+; VI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
+; VI-SDAG-NEXT: s_movk_i32 s6, 0x7e00
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT: v_readfirstlane_b32 s4, v1
+; VI-SDAG-NEXT: s_and_b32 s5, s4, 0x1ff
+; VI-SDAG-NEXT: v_or_b32_e32 v0, s5, v0
+; VI-SDAG-NEXT: s_lshr_b32 s7, s4, 8
+; VI-SDAG-NEXT: s_bfe_u32 s8, s4, 0xb0014
+; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; VI-SDAG-NEXT: s_and_b32 s5, s7, 0xffe
+; VI-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13
+; VI-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; VI-SDAG-NEXT: s_or_b32 s5, s5, s7
+; VI-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; VI-SDAG-NEXT: s_or_b32 s7, s5, 0x1000
+; VI-SDAG-NEXT: s_lshr_b32 s10, s7, s9
+; VI-SDAG-NEXT: s_lshl_b32 s9, s10, s9
+; VI-SDAG-NEXT: s_cmp_lg_u32 s9, s7
+; VI-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; VI-SDAG-NEXT: s_addk_i32 s8, 0xfc10
+; VI-SDAG-NEXT: s_lshl_b32 s9, s8, 12
+; VI-SDAG-NEXT: s_or_b32 s7, s10, s7
+; VI-SDAG-NEXT: s_or_b32 s9, s5, s9
+; VI-SDAG-NEXT: s_cmp_lt_i32 s8, 1
+; VI-SDAG-NEXT: s_cselect_b32 s7, s7, s9
+; VI-SDAG-NEXT: s_and_b32 s9, s7, 7
+; VI-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; VI-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; VI-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; VI-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; VI-SDAG-NEXT: s_lshr_b32 s7, s7, 2
+; VI-SDAG-NEXT: s_or_b32 s9, s9, s10
+; VI-SDAG-NEXT: s_add_i32 s7, s7, s9
+; VI-SDAG-NEXT: s_cmp_lt_i32 s8, 31
+; VI-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00
+; VI-SDAG-NEXT: s_cmp_lg_u32 s5, 0
+; VI-SDAG-NEXT: s_cselect_b32 s5, s6, 0x7c00
+; VI-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f
+; VI-SDAG-NEXT: s_cselect_b32 s5, s5, s7
+; VI-SDAG-NEXT: s_lshr_b32 s4, s4, 16
+; VI-SDAG-NEXT: s_and_b32 s4, s4, 0x8000
+; VI-SDAG-NEXT: s_or_b32 s4, s4, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_f64_to_f16_afn:
+; VI-GISEL: ; %bb.0: ; %entry
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: s_endpgm
+;
+; GFX9-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s10
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s11
+; GFX9-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s8
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s9
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7e00
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-SDAG-NEXT: s_and_b32 s6, s5, 0x1ff
+; GFX9-SDAG-NEXT: v_or_b32_e32 v0, s6, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s7, s5, 8
+; GFX9-SDAG-NEXT: s_bfe_u32 s8, s5, 0xb0014
+; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-SDAG-NEXT: s_and_b32 s6, s7, 0xffe
+; GFX9-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; GFX9-SDAG-NEXT: s_or_b32 s6, s6, s7
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; GFX9-SDAG-NEXT: s_or_b32 s7, s6, 0x1000
+; GFX9-SDAG-NEXT: s_lshr_b32 s10, s7, s9
+; GFX9-SDAG-NEXT: s_lshl_b32 s9, s10, s9
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s9, s7
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; GFX9-SDAG-NEXT: s_addk_i32 s8, 0xfc10
+; GFX9-SDAG-NEXT: s_lshl_b32 s9, s8, 12
+; GFX9-SDAG-NEXT: s_or_b32 s7, s10, s7
+; GFX9-SDAG-NEXT: s_or_b32 s9, s6, s9
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s8, 1
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, s9
+; GFX9-SDAG-NEXT: s_and_b32 s9, s7, 7
+; GFX9-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; GFX9-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; GFX9-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; GFX9-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; GFX9-SDAG-NEXT: s_lshr_b32 s7, s7, 2
+; GFX9-SDAG-NEXT: s_or_b32 s9, s9, s10
+; GFX9-SDAG-NEXT: s_add_i32 s7, s7, s9
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s8, 31
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s6, 0
+; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f
+; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, s7
+; GFX9-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; GFX9-SDAG-NEXT: s_or_b32 s4, s5, s4
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: fptrunc_f64_to_f16_afn:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX950-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX950-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_mov_b32 s4, s10
+; GFX950-SDAG-NEXT: s_mov_b32 s5, s11
+; GFX950-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX950-SDAG-NEXT: s_mov_b32 s0, s8
+; GFX950-SDAG-NEXT: s_mov_b32 s1, s9
+; GFX950-SDAG-NEXT: s_movk_i32 s4, 0x7e00
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s5, v1
+; GFX950-SDAG-NEXT: s_and_b32 s6, s5, 0x1ff
+; GFX950-SDAG-NEXT: v_or_b32_e32 v0, s6, v0
+; GFX950-SDAG-NEXT: s_lshr_b32 s7, s5, 8
+; GFX950-SDAG-NEXT: s_bfe_u32 s8, s5, 0xb0014
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX950-SDAG-NEXT: s_and_b32 s6, s7, 0xffe
+; GFX950-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8
+; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; GFX950-SDAG-NEXT: s_or_b32 s6, s6, s7
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; GFX950-SDAG-NEXT: s_or_b32 s7, s6, 0x1000
+; GFX950-SDAG-NEXT: s_lshr_b32 s10, s7, s9
+; GFX950-SDAG-NEXT: s_lshl_b32 s9, s10, s9
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s9, s7
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; GFX950-SDAG-NEXT: s_addk_i32 s8, 0xfc10
+; GFX950-SDAG-NEXT: s_lshl_b32 s9, s8, 12
+; GFX950-SDAG-NEXT: s_or_b32 s7, s10, s7
+; GFX950-SDAG-NEXT: s_or_b32 s9, s6, s9
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s8, 1
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, s9
+; GFX950-SDAG-NEXT: s_and_b32 s9, s7, 7
+; GFX950-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; GFX950-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; GFX950-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; GFX950-SDAG-NEXT: s_lshr_b32 s7, s7, 2
+; GFX950-SDAG-NEXT: s_or_b32 s9, s9, s10
+; GFX950-SDAG-NEXT: s_add_i32 s7, s7, s9
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s8, 31
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s6, 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f
+; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, s7
+; GFX950-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; GFX950-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; GFX950-SDAG-NEXT: s_or_b32 s4, s5, s4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX950-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: fptrunc_f64_to_f16_afn:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX950-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-TRUE16-LABEL: fptrunc_f64_to_f16_afn:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s3, s2, 0x1ff
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, s3, v0
+; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s3, s2, 0xb0014
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v1, s4, 0, 13
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s8, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s9, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s9, s8
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s8, s5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s3, 0xfc10
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s9, s5
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s3, 12
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s4, s8
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s8, s5, 7
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 31
+; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s8, 0x7e00
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s4, s8, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s4, s5
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: fptrunc_f64_to_f16_afn:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-SDAG-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s3, s2, 0x1ff
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, s3, v0
+; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s3, s2, 0xb0014
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v1, s4, 0, 13
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s8, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s9, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s9, s8
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s8, s5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s3, 0xfc10
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s9, s5
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s3, 12
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s4, s8
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s8, s5, 7
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 31
+; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s8, 0x7e00
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s4, s8, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s4, s5
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-FAKE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-TRUE16-LABEL: fptrunc_f64_to_f16_afn:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
@@ -384,7 +1539,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-GISEL-TRUE16-NEXT: s_endpgm
;
-; GFX11-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16_afn:
; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry
; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -401,7 +1556,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
ptr addrspace(1) %a) {
entry:
%a.val = load double, ptr addrspace(1) %a
- %r.val = fptrunc double %a.val to half
+ %r.val = fptrunc afn double %a.val to half
store half %r.val, ptr addrspace(1) %r
ret void
}
@@ -626,25 +1781,106 @@ entry:
define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; SI-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
; SI-SDAG: ; %bb.0: ; %entry
-; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; SI-SDAG-NEXT: s_mov_b32 s6, -1
-; SI-SDAG-NEXT: s_mov_b32 s10, s6
-; SI-SDAG-NEXT: s_mov_b32 s11, s7
+; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: s_mov_b32 s10, s2
+; SI-SDAG-NEXT: s_mov_b32 s11, s3
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s8, s2
-; SI-SDAG-NEXT: s_mov_b32 s9, s3
+; SI-SDAG-NEXT: s_mov_b32 s8, s6
+; SI-SDAG-NEXT: s_mov_b32 s9, s7
; SI-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; SI-SDAG-NEXT: s_mov_b32 s4, s0
-; SI-SDAG-NEXT: s_mov_b32 s5, s1
+; SI-SDAG-NEXT: s_movk_i32 s0, 0x7e00
; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
-; SI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v2
-; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; SI-SDAG-NEXT: v_readfirstlane_b32 s6, v1
+; SI-SDAG-NEXT: s_and_b32 s7, s1, 0x1ff
+; SI-SDAG-NEXT: s_lshr_b32 s8, s1, 8
+; SI-SDAG-NEXT: s_bfe_u32 s9, s1, 0xb0014
+; SI-SDAG-NEXT: v_or_b32_e32 v1, s7, v2
+; SI-SDAG-NEXT: s_and_b32 s7, s8, 0xffe
+; SI-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9
+; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13
+; SI-SDAG-NEXT: v_readfirstlane_b32 s8, v1
+; SI-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; SI-SDAG-NEXT: s_or_b32 s7, s7, s8
+; SI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; SI-SDAG-NEXT: s_lshr_b32 s11, s8, s10
+; SI-SDAG-NEXT: s_lshl_b32 s10, s11, s10
+; SI-SDAG-NEXT: s_cmp_lg_u32 s10, s8
+; SI-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; SI-SDAG-NEXT: s_addk_i32 s9, 0xfc10
+; SI-SDAG-NEXT: s_or_b32 s8, s11, s8
+; SI-SDAG-NEXT: s_lshl_b32 s10, s9, 12
+; SI-SDAG-NEXT: s_or_b32 s10, s7, s10
+; SI-SDAG-NEXT: s_cmp_lt_i32 s9, 1
+; SI-SDAG-NEXT: s_cselect_b32 s8, s8, s10
+; SI-SDAG-NEXT: s_and_b32 s10, s8, 7
+; SI-SDAG-NEXT: s_cmp_gt_i32 s10, 5
+; SI-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; SI-SDAG-NEXT: s_cmp_eq_u32 s10, 3
+; SI-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; SI-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; SI-SDAG-NEXT: s_or_b32 s10, s10, s11
+; SI-SDAG-NEXT: s_add_i32 s8, s8, s10
+; SI-SDAG-NEXT: s_cmp_lt_i32 s9, 31
+; SI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; SI-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; SI-SDAG-NEXT: s_cselect_b32 s7, s0, 0x7c00
+; SI-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f
+; SI-SDAG-NEXT: s_cselect_b32 s7, s7, s8
+; SI-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; SI-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff
+; SI-SDAG-NEXT: s_lshr_b32 s9, s6, 8
+; SI-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014
+; SI-SDAG-NEXT: s_and_b32 s1, s1, 0x8000
+; SI-SDAG-NEXT: v_or_b32_e32 v0, s8, v0
+; SI-SDAG-NEXT: s_and_b32 s8, s9, 0xffe
+; SI-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10
+; SI-SDAG-NEXT: s_or_b32 s1, s1, s7
+; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13
+; SI-SDAG-NEXT: s_lshl_b32 s1, s1, 16
+; SI-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; SI-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; SI-SDAG-NEXT: s_or_b32 s7, s8, s7
+; SI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; SI-SDAG-NEXT: s_lshr_b32 s11, s8, s9
+; SI-SDAG-NEXT: s_lshl_b32 s9, s11, s9
+; SI-SDAG-NEXT: s_cmp_lg_u32 s9, s8
+; SI-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; SI-SDAG-NEXT: s_addk_i32 s10, 0xfc10
+; SI-SDAG-NEXT: s_or_b32 s8, s11, s8
+; SI-SDAG-NEXT: s_lshl_b32 s9, s10, 12
+; SI-SDAG-NEXT: s_or_b32 s9, s7, s9
+; SI-SDAG-NEXT: s_cmp_lt_i32 s10, 1
+; SI-SDAG-NEXT: s_cselect_b32 s8, s8, s9
+; SI-SDAG-NEXT: s_and_b32 s9, s8, 7
+; SI-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; SI-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; SI-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; SI-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; SI-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; SI-SDAG-NEXT: s_or_b32 s9, s9, s11
+; SI-SDAG-NEXT: s_add_i32 s8, s8, s9
+; SI-SDAG-NEXT: s_cmp_lt_i32 s10, 31
+; SI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; SI-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; SI-SDAG-NEXT: s_cselect_b32 s0, s0, 0x7c00
+; SI-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
+; SI-SDAG-NEXT: s_cselect_b32 s0, s0, s8
+; SI-SDAG-NEXT: s_lshr_b32 s6, s6, 16
+; SI-SDAG-NEXT: s_and_b32 s6, s6, 0x8000
+; SI-SDAG-NEXT: s_or_b32 s0, s6, s0
+; SI-SDAG-NEXT: s_and_b32 s0, s0, 0xffff
+; SI-SDAG-NEXT: s_or_b32 s6, s0, s1
+; SI-SDAG-NEXT: s_mov_b32 s0, s4
+; SI-SDAG-NEXT: s_mov_b32 s1, s5
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-SDAG-NEXT: s_endpgm
;
; SI-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
@@ -654,6 +1890,1251 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; SI-GISEL-NEXT: s_mov_b32 s2, -1
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT: s_bfe_u32 s3, s5, 0xb0014
+; SI-GISEL-NEXT: s_lshr_b32 s8, s5, 8
+; SI-GISEL-NEXT: s_and_b32 s9, s5, 0x1ff
+; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
+; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe
+; SI-GISEL-NEXT: s_or_b32 s4, s9, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s4, s8, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; SI-GISEL-NEXT: s_lshl_b32 s8, s8, 9
+; SI-GISEL-NEXT: s_lshl_b32 s9, s3, 12
+; SI-GISEL-NEXT: s_sub_i32 s10, 1, s3
+; SI-GISEL-NEXT: s_or_b32 s11, s4, 0x1000
+; SI-GISEL-NEXT: s_or_b32 s8, s8, 0x7c00
+; SI-GISEL-NEXT: s_or_b32 s4, s4, s9
+; SI-GISEL-NEXT: s_max_i32 s9, s10, 0
+; SI-GISEL-NEXT: s_min_i32 s9, s9, 13
+; SI-GISEL-NEXT: s_lshr_b32 s10, s11, s9
+; SI-GISEL-NEXT: s_lshl_b32 s9, s10, s9
+; SI-GISEL-NEXT: s_cmp_lg_u32 s9, s11
+; SI-GISEL-NEXT: s_cselect_b32 s9, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s9, s10, s9
+; SI-GISEL-NEXT: s_cmp_lt_i32 s3, 1
+; SI-GISEL-NEXT: s_cselect_b32 s4, s9, s4
+; SI-GISEL-NEXT: s_and_b32 s9, s4, 7
+; SI-GISEL-NEXT: s_lshr_b32 s4, s4, 2
+; SI-GISEL-NEXT: s_cmp_eq_u32 s9, 3
+; SI-GISEL-NEXT: s_cselect_b32 s10, 1, 0
+; SI-GISEL-NEXT: s_cmp_gt_i32 s9, 5
+; SI-GISEL-NEXT: s_cselect_b32 s9, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s9, s10, s9
+; SI-GISEL-NEXT: s_add_i32 s4, s4, s9
+; SI-GISEL-NEXT: s_cmp_gt_i32 s3, 30
+; SI-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4
+; SI-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; SI-GISEL-NEXT: s_cselect_b32 s3, s8, s4
+; SI-GISEL-NEXT: s_lshr_b32 s4, s5, 16
+; SI-GISEL-NEXT: s_bfe_u32 s5, s7, 0xb0014
+; SI-GISEL-NEXT: s_lshr_b32 s8, s7, 8
+; SI-GISEL-NEXT: s_and_b32 s9, s7, 0x1ff
+; SI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
+; SI-GISEL-NEXT: s_addk_i32 s5, 0xfc10
+; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe
+; SI-GISEL-NEXT: s_or_b32 s6, s9, s6
+; SI-GISEL-NEXT: s_or_b32 s3, s4, s3
+; SI-GISEL-NEXT: s_cmp_lg_u32 s6, 0
+; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s4, s8, s4
+; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9
+; SI-GISEL-NEXT: s_lshl_b32 s8, s5, 12
+; SI-GISEL-NEXT: s_sub_i32 s9, 1, s5
+; SI-GISEL-NEXT: s_or_b32 s10, s4, 0x1000
+; SI-GISEL-NEXT: s_or_b32 s6, s6, 0x7c00
+; SI-GISEL-NEXT: s_or_b32 s4, s4, s8
+; SI-GISEL-NEXT: s_max_i32 s8, s9, 0
+; SI-GISEL-NEXT: s_min_i32 s8, s8, 13
+; SI-GISEL-NEXT: s_lshr_b32 s9, s10, s8
+; SI-GISEL-NEXT: s_lshl_b32 s8, s9, s8
+; SI-GISEL-NEXT: s_cmp_lg_u32 s8, s10
+; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s8, s9, s8
+; SI-GISEL-NEXT: s_cmp_lt_i32 s5, 1
+; SI-GISEL-NEXT: s_cselect_b32 s4, s8, s4
+; SI-GISEL-NEXT: s_and_b32 s8, s4, 7
+; SI-GISEL-NEXT: s_lshr_b32 s4, s4, 2
+; SI-GISEL-NEXT: s_cmp_eq_u32 s8, 3
+; SI-GISEL-NEXT: s_cselect_b32 s9, 1, 0
+; SI-GISEL-NEXT: s_cmp_gt_i32 s8, 5
+; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; SI-GISEL-NEXT: s_or_b32 s8, s9, s8
+; SI-GISEL-NEXT: s_add_i32 s4, s4, s8
+; SI-GISEL-NEXT: s_cmp_gt_i32 s5, 30
+; SI-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4
+; SI-GISEL-NEXT: s_cmpk_eq_i32 s5, 0x40f
+; SI-GISEL-NEXT: s_cselect_b32 s4, s6, s4
+; SI-GISEL-NEXT: s_lshr_b32 s5, s7, 16
+; SI-GISEL-NEXT: s_and_b32 s3, s3, 0xffff
+; SI-GISEL-NEXT: s_and_b32 s5, s5, 0x8000
+; SI-GISEL-NEXT: s_or_b32 s4, s5, s4
+; SI-GISEL-NEXT: s_and_b32 s4, s4, 0xffff
+; SI-GISEL-NEXT: s_lshl_b32 s4, s4, 16
+; SI-GISEL-NEXT: s_or_b32 s4, s3, s4
+; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-GISEL-NEXT: s_endpgm
+;
+; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
+; VI-SDAG: ; %bb.0: ; %entry
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
+; VI-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
+; VI-SDAG-NEXT: s_movk_i32 s6, 0x7e00
+; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; VI-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; VI-SDAG-NEXT: s_and_b32 s7, s4, 0x1ff
+; VI-SDAG-NEXT: v_readfirstlane_b32 s5, v1
+; VI-SDAG-NEXT: v_or_b32_e32 v1, s7, v2
+; VI-SDAG-NEXT: s_lshr_b32 s8, s4, 8
+; VI-SDAG-NEXT: s_bfe_u32 s9, s4, 0xb0014
+; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; VI-SDAG-NEXT: s_and_b32 s7, s8, 0xffe
+; VI-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13
+; VI-SDAG-NEXT: v_readfirstlane_b32 s8, v1
+; VI-SDAG-NEXT: s_or_b32 s7, s7, s8
+; VI-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; VI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; VI-SDAG-NEXT: s_lshr_b32 s11, s8, s10
+; VI-SDAG-NEXT: s_lshl_b32 s10, s11, s10
+; VI-SDAG-NEXT: s_cmp_lg_u32 s10, s8
+; VI-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; VI-SDAG-NEXT: s_addk_i32 s9, 0xfc10
+; VI-SDAG-NEXT: s_lshl_b32 s10, s9, 12
+; VI-SDAG-NEXT: s_or_b32 s8, s11, s8
+; VI-SDAG-NEXT: s_or_b32 s10, s7, s10
+; VI-SDAG-NEXT: s_cmp_lt_i32 s9, 1
+; VI-SDAG-NEXT: s_cselect_b32 s8, s8, s10
+; VI-SDAG-NEXT: s_and_b32 s10, s8, 7
+; VI-SDAG-NEXT: s_cmp_gt_i32 s10, 5
+; VI-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; VI-SDAG-NEXT: s_cmp_eq_u32 s10, 3
+; VI-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; VI-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; VI-SDAG-NEXT: s_or_b32 s10, s10, s11
+; VI-SDAG-NEXT: s_add_i32 s8, s8, s10
+; VI-SDAG-NEXT: s_cmp_lt_i32 s9, 31
+; VI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; VI-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; VI-SDAG-NEXT: s_cselect_b32 s7, s6, 0x7c00
+; VI-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f
+; VI-SDAG-NEXT: s_cselect_b32 s7, s7, s8
+; VI-SDAG-NEXT: s_and_b32 s8, s5, 0x1ff
+; VI-SDAG-NEXT: v_or_b32_e32 v0, s8, v0
+; VI-SDAG-NEXT: s_lshr_b32 s4, s4, 16
+; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; VI-SDAG-NEXT: s_lshr_b32 s9, s5, 8
+; VI-SDAG-NEXT: s_bfe_u32 s10, s5, 0xb0014
+; VI-SDAG-NEXT: s_and_b32 s4, s4, 0x8000
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SDAG-NEXT: s_and_b32 s8, s9, 0xffe
+; VI-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10
+; VI-SDAG-NEXT: s_or_b32 s4, s4, s7
+; VI-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; VI-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13
+; VI-SDAG-NEXT: s_or_b32 s7, s8, s7
+; VI-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; VI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; VI-SDAG-NEXT: s_lshr_b32 s11, s8, s9
+; VI-SDAG-NEXT: s_lshl_b32 s4, s4, 16
+; VI-SDAG-NEXT: s_lshl_b32 s9, s11, s9
+; VI-SDAG-NEXT: s_cmp_lg_u32 s9, s8
+; VI-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; VI-SDAG-NEXT: s_addk_i32 s10, 0xfc10
+; VI-SDAG-NEXT: s_lshl_b32 s9, s10, 12
+; VI-SDAG-NEXT: s_or_b32 s8, s11, s8
+; VI-SDAG-NEXT: s_or_b32 s9, s7, s9
+; VI-SDAG-NEXT: s_cmp_lt_i32 s10, 1
+; VI-SDAG-NEXT: s_cselect_b32 s8, s8, s9
+; VI-SDAG-NEXT: s_and_b32 s9, s8, 7
+; VI-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; VI-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; VI-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; VI-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; VI-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; VI-SDAG-NEXT: s_or_b32 s9, s9, s11
+; VI-SDAG-NEXT: s_add_i32 s8, s8, s9
+; VI-SDAG-NEXT: s_cmp_lt_i32 s10, 31
+; VI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; VI-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; VI-SDAG-NEXT: s_cselect_b32 s6, s6, 0x7c00
+; VI-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
+; VI-SDAG-NEXT: s_cselect_b32 s6, s6, s8
+; VI-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; VI-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; VI-SDAG-NEXT: s_or_b32 s5, s5, s6
+; VI-SDAG-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-SDAG-NEXT: s_or_b32 s4, s5, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
+; VI-GISEL: ; %bb.0: ; %entry
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: s_bfe_u32 s2, s5, 0xb0014
+; VI-GISEL-NEXT: s_lshr_b32 s3, s5, 8
+; VI-GISEL-NEXT: s_and_b32 s8, s5, 0x1ff
+; VI-GISEL-NEXT: s_addk_i32 s2, 0xfc10
+; VI-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
+; VI-GISEL-NEXT: s_or_b32 s4, s8, s4
+; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s3, s3, s4
+; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; VI-GISEL-NEXT: s_sub_i32 s9, 1, s2
+; VI-GISEL-NEXT: s_lshl_b32 s8, s2, 12
+; VI-GISEL-NEXT: s_max_i32 s9, s9, 0
+; VI-GISEL-NEXT: s_or_b32 s8, s3, s8
+; VI-GISEL-NEXT: s_min_i32 s9, s9, 13
+; VI-GISEL-NEXT: s_bitset1_b32 s3, 12
+; VI-GISEL-NEXT: s_lshl_b32 s4, s4, 9
+; VI-GISEL-NEXT: s_lshr_b32 s10, s3, s9
+; VI-GISEL-NEXT: s_or_b32 s4, s4, 0x7c00
+; VI-GISEL-NEXT: s_lshl_b32 s9, s10, s9
+; VI-GISEL-NEXT: s_cmp_lg_u32 s9, s3
+; VI-GISEL-NEXT: s_cselect_b32 s3, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s3, s10, s3
+; VI-GISEL-NEXT: s_cmp_lt_i32 s2, 1
+; VI-GISEL-NEXT: s_cselect_b32 s3, s3, s8
+; VI-GISEL-NEXT: s_and_b32 s8, s3, 7
+; VI-GISEL-NEXT: s_lshr_b32 s3, s3, 2
+; VI-GISEL-NEXT: s_cmp_eq_u32 s8, 3
+; VI-GISEL-NEXT: s_cselect_b32 s9, 1, 0
+; VI-GISEL-NEXT: s_cmp_gt_i32 s8, 5
+; VI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s8, s9, s8
+; VI-GISEL-NEXT: s_add_i32 s3, s3, s8
+; VI-GISEL-NEXT: s_cmp_gt_i32 s2, 30
+; VI-GISEL-NEXT: s_cselect_b32 s3, 0x7c00, s3
+; VI-GISEL-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; VI-GISEL-NEXT: s_cselect_b32 s2, s4, s3
+; VI-GISEL-NEXT: s_lshr_b32 s3, s5, 16
+; VI-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; VI-GISEL-NEXT: s_or_b32 s2, s3, s2
+; VI-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
+; VI-GISEL-NEXT: s_lshr_b32 s4, s7, 8
+; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
+; VI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
+; VI-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
+; VI-GISEL-NEXT: s_or_b32 s5, s5, s6
+; VI-GISEL-NEXT: s_cmp_lg_u32 s5, 0
+; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s4, s4, s5
+; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; VI-GISEL-NEXT: s_sub_i32 s8, 1, s3
+; VI-GISEL-NEXT: s_lshl_b32 s6, s3, 12
+; VI-GISEL-NEXT: s_max_i32 s8, s8, 0
+; VI-GISEL-NEXT: s_or_b32 s6, s4, s6
+; VI-GISEL-NEXT: s_min_i32 s8, s8, 13
+; VI-GISEL-NEXT: s_bitset1_b32 s4, 12
+; VI-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; VI-GISEL-NEXT: s_lshr_b32 s9, s4, s8
+; VI-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; VI-GISEL-NEXT: s_lshl_b32 s8, s9, s8
+; VI-GISEL-NEXT: s_cmp_lg_u32 s8, s4
+; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s4, s9, s4
+; VI-GISEL-NEXT: s_cmp_lt_i32 s3, 1
+; VI-GISEL-NEXT: s_cselect_b32 s4, s4, s6
+; VI-GISEL-NEXT: s_and_b32 s6, s4, 7
+; VI-GISEL-NEXT: s_lshr_b32 s4, s4, 2
+; VI-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; VI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; VI-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; VI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s6, s8, s6
+; VI-GISEL-NEXT: s_add_i32 s4, s4, s6
+; VI-GISEL-NEXT: s_cmp_gt_i32 s3, 30
+; VI-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4
+; VI-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; VI-GISEL-NEXT: s_cselect_b32 s3, s5, s4
+; VI-GISEL-NEXT: s_lshr_b32 s4, s7, 16
+; VI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
+; VI-GISEL-NEXT: s_or_b32 s3, s4, s3
+; VI-GISEL-NEXT: s_and_b32 s3, s3, 0xffff
+; VI-GISEL-NEXT: s_and_b32 s2, s2, 0xffff
+; VI-GISEL-NEXT: s_lshl_b32 s3, s3, 16
+; VI-GISEL-NEXT: s_or_b32 s2, s2, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-GISEL-NEXT: s_endpgm
+;
+; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s10
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s11
+; GFX9-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s8
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s9
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7e00
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9-SDAG-NEXT: s_and_b32 s7, s5, 0x1ff
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s6, v1
+; GFX9-SDAG-NEXT: v_or_b32_e32 v1, s7, v2
+; GFX9-SDAG-NEXT: s_lshr_b32 s8, s5, 8
+; GFX9-SDAG-NEXT: s_bfe_u32 s9, s5, 0xb0014
+; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-SDAG-NEXT: s_and_b32 s7, s8, 0xffe
+; GFX9-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s8, v1
+; GFX9-SDAG-NEXT: s_or_b32 s7, s7, s8
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; GFX9-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; GFX9-SDAG-NEXT: s_lshr_b32 s11, s8, s10
+; GFX9-SDAG-NEXT: s_lshl_b32 s10, s11, s10
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s10, s8
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-SDAG-NEXT: s_addk_i32 s9, 0xfc10
+; GFX9-SDAG-NEXT: s_lshl_b32 s10, s9, 12
+; GFX9-SDAG-NEXT: s_or_b32 s8, s11, s8
+; GFX9-SDAG-NEXT: s_or_b32 s10, s7, s10
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s9, 1
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, s10
+; GFX9-SDAG-NEXT: s_and_b32 s10, s8, 7
+; GFX9-SDAG-NEXT: s_cmp_gt_i32 s10, 5
+; GFX9-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; GFX9-SDAG-NEXT: s_cmp_eq_u32 s10, 3
+; GFX9-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; GFX9-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; GFX9-SDAG-NEXT: s_or_b32 s10, s10, s11
+; GFX9-SDAG-NEXT: s_add_i32 s8, s8, s10
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s9, 31
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, s4, 0x7c00
+; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, s8
+; GFX9-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff
+; GFX9-SDAG-NEXT: v_or_b32_e32 v0, s8, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s9, s6, 8
+; GFX9-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014
+; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-SDAG-NEXT: s_and_b32 s8, s9, 0xffe
+; GFX9-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10
+; GFX9-SDAG-NEXT: s_or_b32 s5, s5, s7
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; GFX9-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13
+; GFX9-SDAG-NEXT: s_or_b32 s7, s8, s7
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; GFX9-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; GFX9-SDAG-NEXT: s_lshr_b32 s11, s8, s9
+; GFX9-SDAG-NEXT: s_lshl_b32 s9, s11, s9
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s9, s8
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-SDAG-NEXT: s_addk_i32 s10, 0xfc10
+; GFX9-SDAG-NEXT: s_lshl_b32 s9, s10, 12
+; GFX9-SDAG-NEXT: s_or_b32 s8, s11, s8
+; GFX9-SDAG-NEXT: s_or_b32 s9, s7, s9
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s10, 1
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, s9
+; GFX9-SDAG-NEXT: s_and_b32 s9, s8, 7
+; GFX9-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; GFX9-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; GFX9-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; GFX9-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; GFX9-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; GFX9-SDAG-NEXT: s_or_b32 s9, s9, s11
+; GFX9-SDAG-NEXT: s_add_i32 s8, s8, s9
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s10, 31
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
+; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, s8
+; GFX9-SDAG-NEXT: s_lshr_b32 s6, s6, 16
+; GFX9-SDAG-NEXT: s_and_b32 s6, s6, 0x8000
+; GFX9-SDAG-NEXT: s_or_b32 s4, s6, s4
+; GFX9-SDAG-NEXT: s_pack_ll_b32_b16 s4, s4, s5
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_bfe_u32 s2, s5, 0xb0014
+; GFX9-GISEL-NEXT: s_lshr_b32 s3, s5, 8
+; GFX9-GISEL-NEXT: s_and_b32 s8, s5, 0x1ff
+; GFX9-GISEL-NEXT: s_addk_i32 s2, 0xfc10
+; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
+; GFX9-GISEL-NEXT: s_or_b32 s4, s8, s4
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s3, s3, s4
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; GFX9-GISEL-NEXT: s_sub_i32 s9, 1, s2
+; GFX9-GISEL-NEXT: s_lshl_b32 s8, s2, 12
+; GFX9-GISEL-NEXT: s_max_i32 s9, s9, 0
+; GFX9-GISEL-NEXT: s_or_b32 s8, s3, s8
+; GFX9-GISEL-NEXT: s_min_i32 s9, s9, 13
+; GFX9-GISEL-NEXT: s_bitset1_b32 s3, 12
+; GFX9-GISEL-NEXT: s_lshl_b32 s4, s4, 9
+; GFX9-GISEL-NEXT: s_lshr_b32 s10, s3, s9
+; GFX9-GISEL-NEXT: s_or_b32 s4, s4, 0x7c00
+; GFX9-GISEL-NEXT: s_lshl_b32 s9, s10, s9
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s9, s3
+; GFX9-GISEL-NEXT: s_cselect_b32 s3, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s3, s10, s3
+; GFX9-GISEL-NEXT: s_cmp_lt_i32 s2, 1
+; GFX9-GISEL-NEXT: s_cselect_b32 s3, s3, s8
+; GFX9-GISEL-NEXT: s_and_b32 s8, s3, 7
+; GFX9-GISEL-NEXT: s_lshr_b32 s3, s3, 2
+; GFX9-GISEL-NEXT: s_cmp_eq_u32 s8, 3
+; GFX9-GISEL-NEXT: s_cselect_b32 s9, 1, 0
+; GFX9-GISEL-NEXT: s_cmp_gt_i32 s8, 5
+; GFX9-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s8, s9, s8
+; GFX9-GISEL-NEXT: s_add_i32 s3, s3, s8
+; GFX9-GISEL-NEXT: s_cmp_gt_i32 s2, 30
+; GFX9-GISEL-NEXT: s_cselect_b32 s3, 0x7c00, s3
+; GFX9-GISEL-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, s4, s3
+; GFX9-GISEL-NEXT: s_lshr_b32 s3, s5, 16
+; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX9-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s7, 8
+; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
+; GFX9-GISEL-NEXT: s_addk_i32 s3, 0xfc10
+; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
+; GFX9-GISEL-NEXT: s_or_b32 s5, s5, s6
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s5, 0
+; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s4, s4, s5
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; GFX9-GISEL-NEXT: s_sub_i32 s8, 1, s3
+; GFX9-GISEL-NEXT: s_lshl_b32 s6, s3, 12
+; GFX9-GISEL-NEXT: s_max_i32 s8, s8, 0
+; GFX9-GISEL-NEXT: s_or_b32 s6, s4, s6
+; GFX9-GISEL-NEXT: s_min_i32 s8, s8, 13
+; GFX9-GISEL-NEXT: s_bitset1_b32 s4, 12
+; GFX9-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; GFX9-GISEL-NEXT: s_lshr_b32 s9, s4, s8
+; GFX9-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX9-GISEL-NEXT: s_lshl_b32 s8, s9, s8
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s8, s4
+; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s4, s9, s4
+; GFX9-GISEL-NEXT: s_cmp_lt_i32 s3, 1
+; GFX9-GISEL-NEXT: s_cselect_b32 s4, s4, s6
+; GFX9-GISEL-NEXT: s_and_b32 s6, s4, 7
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s4, 2
+; GFX9-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; GFX9-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; GFX9-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX9-GISEL-NEXT: s_or_b32 s6, s8, s6
+; GFX9-GISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX9-GISEL-NEXT: s_cmp_gt_i32 s3, 30
+; GFX9-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4
+; GFX9-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX9-GISEL-NEXT: s_cselect_b32 s3, s5, s4
+; GFX9-GISEL-NEXT: s_lshr_b32 s4, s7, 16
+; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
+; GFX9-GISEL-NEXT: s_or_b32 s3, s4, s3
+; GFX9-GISEL-NEXT: s_pack_ll_b32_b16 s2, s2, s3
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX950-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX950-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_mov_b32 s4, s10
+; GFX950-SDAG-NEXT: s_mov_b32 s5, s11
+; GFX950-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX950-SDAG-NEXT: s_mov_b32 s0, s8
+; GFX950-SDAG-NEXT: s_mov_b32 s1, s9
+; GFX950-SDAG-NEXT: s_movk_i32 s4, 0x7e00
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX950-SDAG-NEXT: s_and_b32 s7, s5, 0x1ff
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s6, v1
+; GFX950-SDAG-NEXT: v_or_b32_e32 v1, s7, v2
+; GFX950-SDAG-NEXT: s_lshr_b32 s8, s5, 8
+; GFX950-SDAG-NEXT: s_bfe_u32 s9, s5, 0xb0014
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX950-SDAG-NEXT: s_and_b32 s7, s8, 0xffe
+; GFX950-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9
+; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX950-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s8, v1
+; GFX950-SDAG-NEXT: s_or_b32 s7, s7, s8
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; GFX950-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; GFX950-SDAG-NEXT: s_lshr_b32 s11, s8, s10
+; GFX950-SDAG-NEXT: s_lshl_b32 s10, s11, s10
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s10, s8
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; GFX950-SDAG-NEXT: s_addk_i32 s9, 0xfc10
+; GFX950-SDAG-NEXT: s_lshl_b32 s10, s9, 12
+; GFX950-SDAG-NEXT: s_or_b32 s8, s11, s8
+; GFX950-SDAG-NEXT: s_or_b32 s10, s7, s10
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s9, 1
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, s10
+; GFX950-SDAG-NEXT: s_and_b32 s10, s8, 7
+; GFX950-SDAG-NEXT: s_cmp_gt_i32 s10, 5
+; GFX950-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s10, 3
+; GFX950-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; GFX950-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; GFX950-SDAG-NEXT: s_or_b32 s10, s10, s11
+; GFX950-SDAG-NEXT: s_add_i32 s8, s8, s10
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s9, 31
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, s4, 0x7c00
+; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, s8
+; GFX950-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff
+; GFX950-SDAG-NEXT: v_or_b32_e32 v0, s8, v0
+; GFX950-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX950-SDAG-NEXT: s_lshr_b32 s9, s6, 8
+; GFX950-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014
+; GFX950-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-SDAG-NEXT: s_and_b32 s8, s9, 0xffe
+; GFX950-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10
+; GFX950-SDAG-NEXT: s_or_b32 s5, s5, s7
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; GFX950-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13
+; GFX950-SDAG-NEXT: s_or_b32 s7, s8, s7
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; GFX950-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; GFX950-SDAG-NEXT: s_lshr_b32 s11, s8, s9
+; GFX950-SDAG-NEXT: s_lshl_b32 s9, s11, s9
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s9, s8
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; GFX950-SDAG-NEXT: s_addk_i32 s10, 0xfc10
+; GFX950-SDAG-NEXT: s_lshl_b32 s9, s10, 12
+; GFX950-SDAG-NEXT: s_or_b32 s8, s11, s8
+; GFX950-SDAG-NEXT: s_or_b32 s9, s7, s9
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s10, 1
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, s9
+; GFX950-SDAG-NEXT: s_and_b32 s9, s8, 7
+; GFX950-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; GFX950-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; GFX950-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; GFX950-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; GFX950-SDAG-NEXT: s_or_b32 s9, s9, s11
+; GFX950-SDAG-NEXT: s_add_i32 s8, s8, s9
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s10, 31
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
+; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, s8
+; GFX950-SDAG-NEXT: s_lshr_b32 s6, s6, 16
+; GFX950-SDAG-NEXT: s_and_b32 s6, s6, 0x8000
+; GFX950-SDAG-NEXT: s_or_b32 s4, s6, s4
+; GFX950-SDAG-NEXT: s_pack_ll_b32_b16 s4, s4, s5
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX950-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_bfe_u32 s2, s5, 0xb0014
+; GFX950-GISEL-NEXT: s_lshr_b32 s3, s5, 8
+; GFX950-GISEL-NEXT: s_and_b32 s8, s5, 0x1ff
+; GFX950-GISEL-NEXT: s_addk_i32 s2, 0xfc10
+; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
+; GFX950-GISEL-NEXT: s_or_b32 s4, s8, s4
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s3, s3, s4
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; GFX950-GISEL-NEXT: s_sub_i32 s9, 1, s2
+; GFX950-GISEL-NEXT: s_lshl_b32 s8, s2, 12
+; GFX950-GISEL-NEXT: s_max_i32 s9, s9, 0
+; GFX950-GISEL-NEXT: s_or_b32 s8, s3, s8
+; GFX950-GISEL-NEXT: s_min_i32 s9, s9, 13
+; GFX950-GISEL-NEXT: s_bitset1_b32 s3, 12
+; GFX950-GISEL-NEXT: s_lshl_b32 s4, s4, 9
+; GFX950-GISEL-NEXT: s_lshr_b32 s10, s3, s9
+; GFX950-GISEL-NEXT: s_or_b32 s4, s4, 0x7c00
+; GFX950-GISEL-NEXT: s_lshl_b32 s9, s10, s9
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s9, s3
+; GFX950-GISEL-NEXT: s_cselect_b32 s3, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s3, s10, s3
+; GFX950-GISEL-NEXT: s_cmp_lt_i32 s2, 1
+; GFX950-GISEL-NEXT: s_cselect_b32 s3, s3, s8
+; GFX950-GISEL-NEXT: s_and_b32 s8, s3, 7
+; GFX950-GISEL-NEXT: s_lshr_b32 s3, s3, 2
+; GFX950-GISEL-NEXT: s_cmp_eq_u32 s8, 3
+; GFX950-GISEL-NEXT: s_cselect_b32 s9, 1, 0
+; GFX950-GISEL-NEXT: s_cmp_gt_i32 s8, 5
+; GFX950-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s8, s9, s8
+; GFX950-GISEL-NEXT: s_add_i32 s3, s3, s8
+; GFX950-GISEL-NEXT: s_cmp_gt_i32 s2, 30
+; GFX950-GISEL-NEXT: s_cselect_b32 s3, 0x7c00, s3
+; GFX950-GISEL-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; GFX950-GISEL-NEXT: s_cselect_b32 s2, s4, s3
+; GFX950-GISEL-NEXT: s_lshr_b32 s3, s5, 16
+; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX950-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX950-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
+; GFX950-GISEL-NEXT: s_lshr_b32 s4, s7, 8
+; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
+; GFX950-GISEL-NEXT: s_addk_i32 s3, 0xfc10
+; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
+; GFX950-GISEL-NEXT: s_or_b32 s5, s5, s6
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s5, 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s4, s4, s5
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; GFX950-GISEL-NEXT: s_sub_i32 s8, 1, s3
+; GFX950-GISEL-NEXT: s_lshl_b32 s6, s3, 12
+; GFX950-GISEL-NEXT: s_max_i32 s8, s8, 0
+; GFX950-GISEL-NEXT: s_or_b32 s6, s4, s6
+; GFX950-GISEL-NEXT: s_min_i32 s8, s8, 13
+; GFX950-GISEL-NEXT: s_bitset1_b32 s4, 12
+; GFX950-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; GFX950-GISEL-NEXT: s_lshr_b32 s9, s4, s8
+; GFX950-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX950-GISEL-NEXT: s_lshl_b32 s8, s9, s8
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s8, s4
+; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s4, s9, s4
+; GFX950-GISEL-NEXT: s_cmp_lt_i32 s3, 1
+; GFX950-GISEL-NEXT: s_cselect_b32 s4, s4, s6
+; GFX950-GISEL-NEXT: s_and_b32 s6, s4, 7
+; GFX950-GISEL-NEXT: s_lshr_b32 s4, s4, 2
+; GFX950-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; GFX950-GISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GFX950-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; GFX950-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX950-GISEL-NEXT: s_or_b32 s6, s8, s6
+; GFX950-GISEL-NEXT: s_add_i32 s4, s4, s6
+; GFX950-GISEL-NEXT: s_cmp_gt_i32 s3, 30
+; GFX950-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4
+; GFX950-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX950-GISEL-NEXT: s_cselect_b32 s3, s5, s4
+; GFX950-GISEL-NEXT: s_lshr_b32 s4, s7, 16
+; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
+; GFX950-GISEL-NEXT: s_or_b32 s3, s4, s3
+; GFX950-GISEL-NEXT: s_pack_ll_b32_b16 s2, s2, s3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX950-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-TRUE16-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-SDAG-TRUE16-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s3, s2, 0x1ff
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, s3, v2
+; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s3, s2, 0xb0014
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, s4, 0, 13
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s8, v3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v2
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s9, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s9, s8
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s8, s5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s3, 0xfc10
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s9, s5
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s3, 12
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s4, s8
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s8, s5, 7
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 31
+; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s8, 0x7e00
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, s8, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s9, s5
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s4, 0x1ff
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s10, s4, 8
+; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, s5, v0
+; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s5, s4, 0xb0014
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s10, s10, 0xffe
+; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s9, 0x3f1, s5
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v1, s9, 0, 13
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s11, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s9, v0
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s9, 0x1000
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s12, s10, s11
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s11, s12, s11
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s11, s10
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s5, 0xfc10
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s3, s12, s3
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s10, s5, 12
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s9, s10
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s5, 1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s3, s10
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s10, s3, 7
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s10, 5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s11, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s10, 3
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s10, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s10, s11
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s3, s3, s10
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s5, 31
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s3, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s9, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s5, 0x40f
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s8, s3
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s4, s4, 16
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s4, s4, 0x8000
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s3, s4, s3
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-SDAG-FAKE16-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s3, s2, 0x1ff
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, s3, v2
+; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s3, s2, 0xb0014
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v3, s4, 0, 13
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s8, v3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v2
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s9, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s9, s8
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s8, s5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s3, 0xfc10
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s9, s5
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s3, 12
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s4, s8
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s8, s5, 7
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 31
+; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s8, 0x7e00
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, s8, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s9, s5
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s4, 0x1ff
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s10, s4, 8
+; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, s5, v0
+; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s5, s4, 0xb0014
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s10, s10, 0xffe
+; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s9, 0x3f1, s5
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v1, s9, 0, 13
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s11, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s9, v0
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s9, 0x1000
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s12, s10, s11
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s11, s12, s11
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s11, s10
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s5, 0xfc10
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s3, s12, s3
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s10, s5, 12
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s9, s10
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s5, 1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s3, s10
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s10, s3, 7
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s10, 5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s11, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s10, 3
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s10, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s10, s11
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s3, s3, s10
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s5, 31
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s3, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s9, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s5, 0x40f
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s8, s3
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s4, s4, 16
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s4, s4, 0x8000
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s3, s4, s3
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-FAKE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-TRUE16-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
+; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
+; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s2, 0xfc10
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s8, 1, s2
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000
+; GFX11-GISEL-TRUE16-NEXT: s_max_i32 s8, s8, 0
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s9, s2, 12
+; GFX11-GISEL-TRUE16-NEXT: s_min_i32 s8, s8, 13
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s4, s4, 9
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s11, s10, s8
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s9
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s8, s11, s8
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s4, 0x7c00
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s8, s10
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s11, s8
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lt_i32 s2, 1
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, s8, s3
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s3, 7
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-GISEL-TRUE16-NEXT: s_add_i32 s3, s3, s8
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s2, 30
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 0x7c00, s3
+; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
+; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6
+; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000
+; GFX11-GISEL-TRUE16-NEXT: s_max_i32 s6, s6, 0
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s8, s4, 12
+; GFX11-GISEL-TRUE16-NEXT: s_min_i32 s6, s6, 13
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s5, s5, 9
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s10, s9, s6
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s8
+; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s6, s10, s6
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, s9
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s10, s6
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_lt_i32 s4, 1
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, s6, s3
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 7
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6
+; GFX11-GISEL-TRUE16-NEXT: s_add_i32 s3, s3, s6
+; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s4, 30
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 0x7c00, s3
+; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, s5, s3
+; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s4, s7, 16
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s4, s4, 0x8000
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s4, s3
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s3
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-FAKE16-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
+; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
+; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s2, 0xfc10
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s8, 1, s2
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000
+; GFX11-GISEL-FAKE16-NEXT: s_max_i32 s8, s8, 0
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s9, s2, 12
+; GFX11-GISEL-FAKE16-NEXT: s_min_i32 s8, s8, 13
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s4, s4, 9
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s11, s10, s8
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s9
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s8, s11, s8
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s4, 0x7c00
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s8, s10
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s11, s8
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lt_i32 s2, 1
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, s8, s3
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s3, 7
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s9, s8
+; GFX11-GISEL-FAKE16-NEXT: s_add_i32 s3, s3, s8
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s2, 30
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 0x7c00, s3
+; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
+; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6
+; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000
+; GFX11-GISEL-FAKE16-NEXT: s_max_i32 s6, s6, 0
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s8, s4, 12
+; GFX11-GISEL-FAKE16-NEXT: s_min_i32 s6, s6, 13
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s5, s5, 9
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s10, s9, s6
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s8
+; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s6, s10, s6
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, s9
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s10, s6
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_lt_i32 s4, 1
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, s6, s3
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 7
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6
+; GFX11-GISEL-FAKE16-NEXT: s_add_i32 s3, s3, s6
+; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s4, 30
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 0x7c00, s3
+; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, s5, s3
+; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s4, s7, 16
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s4, s4, 0x8000
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s4, s3
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s3
+; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-FAKE16-NEXT: s_endpgm
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a) {
+entry:
+ %a.val = load <2 x double>, ptr addrspace(1) %a
+ %r.val = fptrunc <2 x double> %a.val to <2 x half>
+ store <2 x half> %r.val, ptr addrspace(1) %r
+ ret void
+}
+
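(Aside for readers of this test: the long SDAG check blocks above are the exact round-to-nearest-even f64-to-f16 narrowing done entirely in scalar ALU ops, and every magic constant in the checks — 0x1ff, 0xffe, 0x3f1, 0x1000, 0xfc10, 0x7c00, 0x7e00 — comes from that algorithm. Below is a minimal C sketch of the same logic, assuming it mirrors the generic soft-float f64-to-f16 lowering; the helper name and the standalone-C framing are illustrative, not the compiler's source.)

    #include <stdint.h>
    #include <string.h>

    /* Round-to-nearest-even f64 -> f16; one lane of the expansion above. */
    static uint16_t f64_to_f16_rtne(double x) {
      uint64_t bits;
      memcpy(&bits, &x, sizeof bits);
      uint32_t hi = (uint32_t)(bits >> 32), lo = (uint32_t)bits;

      uint32_t sign = (hi >> 16) & 0x8000;          /* bit 63 -> bit 15 */
      int32_t  exp  = (hi >> 20) & 0x7ff;           /* biased f64 exponent (the s_bfe_u32 0xb0014) */
      /* Top 11 mantissa bits, plus a sticky bit for everything below them. */
      uint32_t m = ((hi >> 8) & 0xffe) | ((lo | (hi & 0x1ff)) != 0);
      int32_t  e = exp - 1008;                      /* rebias 1023 -> 15; 0xfc10 is -1008 */

      /* Denormal result: shift right by clamp(0x3f1 - exp, 0, 13), keeping a sticky bit. */
      int32_t shift = 0x3f1 - exp;
      shift = shift < 0 ? 0 : (shift > 13 ? 13 : shift);  /* the v_med3_i32 */
      uint32_t d = (m | 0x1000) >> shift;           /* 0x1000 is the implicit leading 1 */
      d |= ((d << shift) != (m | 0x1000));          /* lost bits -> sticky */

      uint32_t v = (e < 1) ? d : (m | ((uint32_t)e << 12));

      /* RTNE: low 3 bits of v are LSB|round|sticky; round up on 3, 6 or 7. */
      uint32_t r = v & 7;
      v = (v >> 2) + (r == 3 || r > 5);

      if (e >= 31)    v = 0x7c00;                   /* overflow -> +/-infinity */
      if (e == 0x40f) v = m ? 0x7e00 : 0x7c00;      /* source NaN vs. infinity */
      return (uint16_t)(sign | v);
    }

Each lane of the <2 x double> runs this sequence once; the two half results are then packed into one dword (s_pack_ll_b32_b16 on GFX9 and later, shift/or on SI and VI), which is the common tail of every SDAG block above.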
+define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn(
+; SI-SDAG-LABEL: fptrunc_v2f64_to_v2f16_afn:
+; SI-SDAG: ; %bb.0: ; %entry
+; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SI-SDAG-NEXT: s_mov_b32 s2, -1
+; SI-SDAG-NEXT: s_mov_b32 s10, s2
+; SI-SDAG-NEXT: s_mov_b32 s11, s3
+; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT: s_mov_b32 s8, s6
+; SI-SDAG-NEXT: s_mov_b32 s9, s7
+; SI-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SI-SDAG-NEXT: s_movk_i32 s0, 0x7e00
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0)
+; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; SI-SDAG-NEXT: v_readfirstlane_b32 s6, v1
+; SI-SDAG-NEXT: s_and_b32 s7, s1, 0x1ff
+; SI-SDAG-NEXT: s_lshr_b32 s8, s1, 8
+; SI-SDAG-NEXT: s_bfe_u32 s9, s1, 0xb0014
+; SI-SDAG-NEXT: v_or_b32_e32 v1, s7, v2
+; SI-SDAG-NEXT: s_and_b32 s7, s8, 0xffe
+; SI-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9
+; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13
+; SI-SDAG-NEXT: v_readfirstlane_b32 s8, v1
+; SI-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; SI-SDAG-NEXT: s_or_b32 s7, s7, s8
+; SI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; SI-SDAG-NEXT: s_lshr_b32 s11, s8, s10
+; SI-SDAG-NEXT: s_lshl_b32 s10, s11, s10
+; SI-SDAG-NEXT: s_cmp_lg_u32 s10, s8
+; SI-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; SI-SDAG-NEXT: s_addk_i32 s9, 0xfc10
+; SI-SDAG-NEXT: s_or_b32 s8, s11, s8
+; SI-SDAG-NEXT: s_lshl_b32 s10, s9, 12
+; SI-SDAG-NEXT: s_or_b32 s10, s7, s10
+; SI-SDAG-NEXT: s_cmp_lt_i32 s9, 1
+; SI-SDAG-NEXT: s_cselect_b32 s8, s8, s10
+; SI-SDAG-NEXT: s_and_b32 s10, s8, 7
+; SI-SDAG-NEXT: s_cmp_gt_i32 s10, 5
+; SI-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; SI-SDAG-NEXT: s_cmp_eq_u32 s10, 3
+; SI-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; SI-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; SI-SDAG-NEXT: s_or_b32 s10, s10, s11
+; SI-SDAG-NEXT: s_add_i32 s8, s8, s10
+; SI-SDAG-NEXT: s_cmp_lt_i32 s9, 31
+; SI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; SI-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; SI-SDAG-NEXT: s_cselect_b32 s7, s0, 0x7c00
+; SI-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f
+; SI-SDAG-NEXT: s_cselect_b32 s7, s7, s8
+; SI-SDAG-NEXT: s_lshr_b32 s1, s1, 16
+; SI-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff
+; SI-SDAG-NEXT: s_lshr_b32 s9, s6, 8
+; SI-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014
+; SI-SDAG-NEXT: s_and_b32 s1, s1, 0x8000
+; SI-SDAG-NEXT: v_or_b32_e32 v0, s8, v0
+; SI-SDAG-NEXT: s_and_b32 s8, s9, 0xffe
+; SI-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10
+; SI-SDAG-NEXT: s_or_b32 s1, s1, s7
+; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13
+; SI-SDAG-NEXT: s_lshl_b32 s1, s1, 16
+; SI-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; SI-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; SI-SDAG-NEXT: s_or_b32 s7, s8, s7
+; SI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; SI-SDAG-NEXT: s_lshr_b32 s11, s8, s9
+; SI-SDAG-NEXT: s_lshl_b32 s9, s11, s9
+; SI-SDAG-NEXT: s_cmp_lg_u32 s9, s8
+; SI-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; SI-SDAG-NEXT: s_addk_i32 s10, 0xfc10
+; SI-SDAG-NEXT: s_or_b32 s8, s11, s8
+; SI-SDAG-NEXT: s_lshl_b32 s9, s10, 12
+; SI-SDAG-NEXT: s_or_b32 s9, s7, s9
+; SI-SDAG-NEXT: s_cmp_lt_i32 s10, 1
+; SI-SDAG-NEXT: s_cselect_b32 s8, s8, s9
+; SI-SDAG-NEXT: s_and_b32 s9, s8, 7
+; SI-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; SI-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; SI-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; SI-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; SI-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; SI-SDAG-NEXT: s_or_b32 s9, s9, s11
+; SI-SDAG-NEXT: s_add_i32 s8, s8, s9
+; SI-SDAG-NEXT: s_cmp_lt_i32 s10, 31
+; SI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; SI-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; SI-SDAG-NEXT: s_cselect_b32 s0, s0, 0x7c00
+; SI-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
+; SI-SDAG-NEXT: s_cselect_b32 s0, s0, s8
+; SI-SDAG-NEXT: s_lshr_b32 s6, s6, 16
+; SI-SDAG-NEXT: s_and_b32 s6, s6, 0x8000
+; SI-SDAG-NEXT: s_or_b32 s0, s6, s0
+; SI-SDAG-NEXT: s_and_b32 s0, s0, 0xffff
+; SI-SDAG-NEXT: s_or_b32 s6, s0, s1
+; SI-SDAG-NEXT: s_mov_b32 s0, s4
+; SI-SDAG-NEXT: s_mov_b32 s1, s5
+; SI-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-SDAG-NEXT: s_endpgm
+;
+; SI-GISEL-LABEL: fptrunc_v2f64_to_v2f16_afn:
+; SI-GISEL: ; %bb.0: ; %entry
+; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; SI-GISEL-NEXT: s_mov_b32 s2, -1
+; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; SI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
@@ -664,29 +3145,111 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-GISEL-NEXT: s_endpgm
;
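(Note how the SI-GISEL `_afn` checks just above keep the short two-instruction narrowing, v_cvt_f32_f64 followed by v_cvt_f16_f32: under the afn fast-math flag the double rounding through f32 is tolerated, so only the non-afn variants get the long expansion. A hedged one-line sketch of that shortcut, assuming a toolchain where _Float16 is available:)

    /* f64 -> f32 -> f16 in two steps; may double-round, which afn permits. */
    static _Float16 f64_to_f16_afn(double x) {
      return (_Float16)(float)x;  /* matches the v_cvt_f32_f64 + v_cvt_f16_f32 pair */
    }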
-; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
+; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f16_afn:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
+; VI-SDAG-NEXT: s_movk_i32 s6, 0x7e00
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
-; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; VI-SDAG-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-SDAG-NEXT: v_readfirstlane_b32 s4, v3
+; VI-SDAG-NEXT: s_and_b32 s7, s4, 0x1ff
+; VI-SDAG-NEXT: v_readfirstlane_b32 s5, v1
+; VI-SDAG-NEXT: v_or_b32_e32 v1, s7, v2
+; VI-SDAG-NEXT: s_lshr_b32 s8, s4, 8
+; VI-SDAG-NEXT: s_bfe_u32 s9, s4, 0xb0014
+; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; VI-SDAG-NEXT: s_and_b32 s7, s8, 0xffe
+; VI-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13
+; VI-SDAG-NEXT: v_readfirstlane_b32 s8, v1
+; VI-SDAG-NEXT: s_or_b32 s7, s7, s8
+; VI-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; VI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; VI-SDAG-NEXT: s_lshr_b32 s11, s8, s10
+; VI-SDAG-NEXT: s_lshl_b32 s10, s11, s10
+; VI-SDAG-NEXT: s_cmp_lg_u32 s10, s8
+; VI-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; VI-SDAG-NEXT: s_addk_i32 s9, 0xfc10
+; VI-SDAG-NEXT: s_lshl_b32 s10, s9, 12
+; VI-SDAG-NEXT: s_or_b32 s8, s11, s8
+; VI-SDAG-NEXT: s_or_b32 s10, s7, s10
+; VI-SDAG-NEXT: s_cmp_lt_i32 s9, 1
+; VI-SDAG-NEXT: s_cselect_b32 s8, s8, s10
+; VI-SDAG-NEXT: s_and_b32 s10, s8, 7
+; VI-SDAG-NEXT: s_cmp_gt_i32 s10, 5
+; VI-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; VI-SDAG-NEXT: s_cmp_eq_u32 s10, 3
+; VI-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; VI-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; VI-SDAG-NEXT: s_or_b32 s10, s10, s11
+; VI-SDAG-NEXT: s_add_i32 s8, s8, s10
+; VI-SDAG-NEXT: s_cmp_lt_i32 s9, 31
+; VI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; VI-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; VI-SDAG-NEXT: s_cselect_b32 s7, s6, 0x7c00
+; VI-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f
+; VI-SDAG-NEXT: s_cselect_b32 s7, s7, s8
+; VI-SDAG-NEXT: s_and_b32 s8, s5, 0x1ff
+; VI-SDAG-NEXT: v_or_b32_e32 v0, s8, v0
+; VI-SDAG-NEXT: s_lshr_b32 s4, s4, 16
+; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; VI-SDAG-NEXT: s_lshr_b32 s9, s5, 8
+; VI-SDAG-NEXT: s_bfe_u32 s10, s5, 0xb0014
+; VI-SDAG-NEXT: s_and_b32 s4, s4, 0x8000
+; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SDAG-NEXT: s_and_b32 s8, s9, 0xffe
+; VI-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10
+; VI-SDAG-NEXT: s_or_b32 s4, s4, s7
+; VI-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; VI-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13
+; VI-SDAG-NEXT: s_or_b32 s7, s8, s7
+; VI-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; VI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; VI-SDAG-NEXT: s_lshr_b32 s11, s8, s9
+; VI-SDAG-NEXT: s_lshl_b32 s4, s4, 16
+; VI-SDAG-NEXT: s_lshl_b32 s9, s11, s9
+; VI-SDAG-NEXT: s_cmp_lg_u32 s9, s8
+; VI-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; VI-SDAG-NEXT: s_addk_i32 s10, 0xfc10
+; VI-SDAG-NEXT: s_lshl_b32 s9, s10, 12
+; VI-SDAG-NEXT: s_or_b32 s8, s11, s8
+; VI-SDAG-NEXT: s_or_b32 s9, s7, s9
+; VI-SDAG-NEXT: s_cmp_lt_i32 s10, 1
+; VI-SDAG-NEXT: s_cselect_b32 s8, s8, s9
+; VI-SDAG-NEXT: s_and_b32 s9, s8, 7
+; VI-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; VI-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; VI-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; VI-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; VI-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; VI-SDAG-NEXT: s_or_b32 s9, s9, s11
+; VI-SDAG-NEXT: s_add_i32 s8, s8, s9
+; VI-SDAG-NEXT: s_cmp_lt_i32 s10, 31
+; VI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; VI-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; VI-SDAG-NEXT: s_cselect_b32 s6, s6, 0x7c00
+; VI-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
+; VI-SDAG-NEXT: s_cselect_b32 s6, s6, s8
+; VI-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; VI-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; VI-SDAG-NEXT: s_or_b32 s5, s5, s6
+; VI-SDAG-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-SDAG-NEXT: s_or_b32 s4, s5, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
-; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
+; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f16_afn:
; VI-GISEL: ; %bb.0: ; %entry
; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -702,29 +3265,109 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-GISEL-NEXT: s_endpgm
;
-; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16_afn:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s7, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX9-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s4, s10
+; GFX9-SDAG-NEXT: s_mov_b32 s5, s11
+; GFX9-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s8
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s9
+; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7e00
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
-; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v2
-; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9-SDAG-NEXT: s_and_b32 s7, s5, 0x1ff
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s6, v1
+; GFX9-SDAG-NEXT: v_or_b32_e32 v1, s7, v2
+; GFX9-SDAG-NEXT: s_lshr_b32 s8, s5, 8
+; GFX9-SDAG-NEXT: s_bfe_u32 s9, s5, 0xb0014
+; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-SDAG-NEXT: s_and_b32 s7, s8, 0xffe
+; GFX9-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s8, v1
+; GFX9-SDAG-NEXT: s_or_b32 s7, s7, s8
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; GFX9-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; GFX9-SDAG-NEXT: s_lshr_b32 s11, s8, s10
+; GFX9-SDAG-NEXT: s_lshl_b32 s10, s11, s10
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s10, s8
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-SDAG-NEXT: s_addk_i32 s9, 0xfc10
+; GFX9-SDAG-NEXT: s_lshl_b32 s10, s9, 12
+; GFX9-SDAG-NEXT: s_or_b32 s8, s11, s8
+; GFX9-SDAG-NEXT: s_or_b32 s10, s7, s10
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s9, 1
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, s10
+; GFX9-SDAG-NEXT: s_and_b32 s10, s8, 7
+; GFX9-SDAG-NEXT: s_cmp_gt_i32 s10, 5
+; GFX9-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; GFX9-SDAG-NEXT: s_cmp_eq_u32 s10, 3
+; GFX9-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; GFX9-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; GFX9-SDAG-NEXT: s_or_b32 s10, s10, s11
+; GFX9-SDAG-NEXT: s_add_i32 s8, s8, s10
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s9, 31
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, s4, 0x7c00
+; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f
+; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, s8
+; GFX9-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff
+; GFX9-SDAG-NEXT: v_or_b32_e32 v0, s8, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-SDAG-NEXT: s_lshr_b32 s9, s6, 8
+; GFX9-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014
+; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-SDAG-NEXT: s_and_b32 s8, s9, 0xffe
+; GFX9-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10
+; GFX9-SDAG-NEXT: s_or_b32 s5, s5, s7
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; GFX9-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13
+; GFX9-SDAG-NEXT: s_or_b32 s7, s8, s7
+; GFX9-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; GFX9-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; GFX9-SDAG-NEXT: s_lshr_b32 s11, s8, s9
+; GFX9-SDAG-NEXT: s_lshl_b32 s9, s11, s9
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s9, s8
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-SDAG-NEXT: s_addk_i32 s10, 0xfc10
+; GFX9-SDAG-NEXT: s_lshl_b32 s9, s10, 12
+; GFX9-SDAG-NEXT: s_or_b32 s8, s11, s8
+; GFX9-SDAG-NEXT: s_or_b32 s9, s7, s9
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s10, 1
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, s9
+; GFX9-SDAG-NEXT: s_and_b32 s9, s8, 7
+; GFX9-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; GFX9-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; GFX9-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; GFX9-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; GFX9-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; GFX9-SDAG-NEXT: s_or_b32 s9, s9, s11
+; GFX9-SDAG-NEXT: s_add_i32 s8, s8, s9
+; GFX9-SDAG-NEXT: s_cmp_lt_i32 s10, 31
+; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX9-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
+; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, s8
+; GFX9-SDAG-NEXT: s_lshr_b32 s6, s6, 16
+; GFX9-SDAG-NEXT: s_and_b32 s6, s6, 0x8000
+; GFX9-SDAG-NEXT: s_or_b32 s4, s6, s4
+; GFX9-SDAG-NEXT: s_pack_ll_b32_b16 s4, s4, s5
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
-; GFX9-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX9-GISEL-LABEL: fptrunc_v2f64_to_v2f16_afn:
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -740,27 +3383,109 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX950-SDAG-LABEL: fptrunc_v2f64_to_v2f16_afn:
; GFX950-SDAG: ; %bb.0: ; %entry
-; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX950-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX950-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX950-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX950-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX950-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX950-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX950-SDAG-NEXT: s_mov_b32 s7, s3
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX950-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX950-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX950-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX950-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX950-SDAG-NEXT: s_mov_b32 s4, s10
+; GFX950-SDAG-NEXT: s_mov_b32 s5, s11
+; GFX950-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX950-SDAG-NEXT: s_mov_b32 s0, s8
+; GFX950-SDAG-NEXT: s_mov_b32 s1, s9
+; GFX950-SDAG-NEXT: s_movk_i32 s4, 0x7e00
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
-; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v0, v0, v2
-; GFX950-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s5, v3
+; GFX950-SDAG-NEXT: s_and_b32 s7, s5, 0x1ff
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s6, v1
+; GFX950-SDAG-NEXT: v_or_b32_e32 v1, s7, v2
+; GFX950-SDAG-NEXT: s_lshr_b32 s8, s5, 8
+; GFX950-SDAG-NEXT: s_bfe_u32 s9, s5, 0xb0014
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX950-SDAG-NEXT: s_and_b32 s7, s8, 0xffe
+; GFX950-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9
+; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX950-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s8, v1
+; GFX950-SDAG-NEXT: s_or_b32 s7, s7, s8
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s10, v2
+; GFX950-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; GFX950-SDAG-NEXT: s_lshr_b32 s11, s8, s10
+; GFX950-SDAG-NEXT: s_lshl_b32 s10, s11, s10
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s10, s8
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; GFX950-SDAG-NEXT: s_addk_i32 s9, 0xfc10
+; GFX950-SDAG-NEXT: s_lshl_b32 s10, s9, 12
+; GFX950-SDAG-NEXT: s_or_b32 s8, s11, s8
+; GFX950-SDAG-NEXT: s_or_b32 s10, s7, s10
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s9, 1
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, s10
+; GFX950-SDAG-NEXT: s_and_b32 s10, s8, 7
+; GFX950-SDAG-NEXT: s_cmp_gt_i32 s10, 5
+; GFX950-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s10, 3
+; GFX950-SDAG-NEXT: s_cselect_b32 s10, 1, 0
+; GFX950-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; GFX950-SDAG-NEXT: s_or_b32 s10, s10, s11
+; GFX950-SDAG-NEXT: s_add_i32 s8, s8, s10
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s9, 31
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, s4, 0x7c00
+; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f
+; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, s8
+; GFX950-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff
+; GFX950-SDAG-NEXT: v_or_b32_e32 v0, s8, v0
+; GFX950-SDAG-NEXT: s_lshr_b32 s5, s5, 16
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX950-SDAG-NEXT: s_lshr_b32 s9, s6, 8
+; GFX950-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014
+; GFX950-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-SDAG-NEXT: s_and_b32 s8, s9, 0xffe
+; GFX950-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10
+; GFX950-SDAG-NEXT: s_or_b32 s5, s5, s7
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s7, v0
+; GFX950-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13
+; GFX950-SDAG-NEXT: s_or_b32 s7, s8, s7
+; GFX950-SDAG-NEXT: v_readfirstlane_b32 s9, v1
+; GFX950-SDAG-NEXT: s_or_b32 s8, s7, 0x1000
+; GFX950-SDAG-NEXT: s_lshr_b32 s11, s8, s9
+; GFX950-SDAG-NEXT: s_lshl_b32 s9, s11, s9
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s9, s8
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; GFX950-SDAG-NEXT: s_addk_i32 s10, 0xfc10
+; GFX950-SDAG-NEXT: s_lshl_b32 s9, s10, 12
+; GFX950-SDAG-NEXT: s_or_b32 s8, s11, s8
+; GFX950-SDAG-NEXT: s_or_b32 s9, s7, s9
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s10, 1
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, s9
+; GFX950-SDAG-NEXT: s_and_b32 s9, s8, 7
+; GFX950-SDAG-NEXT: s_cmp_gt_i32 s9, 5
+; GFX950-SDAG-NEXT: s_cselect_b32 s11, 1, 0
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s9, 3
+; GFX950-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; GFX950-SDAG-NEXT: s_lshr_b32 s8, s8, 2
+; GFX950-SDAG-NEXT: s_or_b32 s9, s9, s11
+; GFX950-SDAG-NEXT: s_add_i32 s8, s8, s9
+; GFX950-SDAG-NEXT: s_cmp_lt_i32 s10, 31
+; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX950-SDAG-NEXT: s_cmp_lg_u32 s7, 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f
+; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, s8
+; GFX950-SDAG-NEXT: s_lshr_b32 s6, s6, 16
+; GFX950-SDAG-NEXT: s_and_b32 s6, s6, 0x8000
+; GFX950-SDAG-NEXT: s_or_b32 s4, s6, s4
+; GFX950-SDAG-NEXT: s_pack_ll_b32_b16 s4, s4, s5
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX950-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX950-SDAG-NEXT: s_endpgm
;
-; GFX950-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX950-GISEL-LABEL: fptrunc_v2f64_to_v2f16_afn:
; GFX950-GISEL: ; %bb.0: ; %entry
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -776,7 +3501,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX950-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX950-GISEL-NEXT: s_endpgm
;
-; GFX11-SDAG-TRUE16-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX11-SDAG-TRUE16-LABEL: fptrunc_v2f64_to_v2f16_afn:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1
@@ -786,21 +3511,113 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0
; GFX11-SDAG-TRUE16-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
-; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
-; GFX11-SDAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v1, v[0:1]
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v2
-; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s3, s2, 0x1ff
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, s3, v2
+; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s3, s2, 0xb0014
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, s4, 0, 13
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s8, v3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v2
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s9, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s9, s8
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s8, s5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s3, 0xfc10
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s9, s5
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s3, 12
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s4, s8
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s8, s5, 7
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 31
+; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s8, 0x7e00
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, s8, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s9, s5
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s4, 0x1ff
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s10, s4, 8
+; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, s5, v0
+; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s5, s4, 0xb0014
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s10, s10, 0xffe
+; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s9, 0x3f1, s5
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v1, s9, 0, 13
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s11, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s9, v0
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s9, 0x1000
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s12, s10, s11
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s11, s12, s11
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s11, s10
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s5, 0xfc10
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s3, s12, s3
+; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s10, s5, 12
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s9, s10
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s5, 1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s3, s10
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s10, s3, 7
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s10, 5
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s11, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s10, 3
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s10, 1, 0
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s10, s11
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s3, s3, s10
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s5, 31
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s3, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s9, 0
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s5, 0x40f
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s8, s3
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s4, s4, 16
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s4, s4, 0x8000
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s3, s4, s3
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, s2
; GFX11-SDAG-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-SDAG-TRUE16-NEXT: s_endpgm
;
-; GFX11-SDAG-FAKE16-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX11-SDAG-FAKE16-LABEL: fptrunc_v2f64_to_v2f16_afn:
; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry
; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1
@@ -810,21 +3627,113 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2
; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0
; GFX11-SDAG-FAKE16-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
-; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
-; GFX11-SDAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v2
-; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s3, s2, 0x1ff
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, s3, v2
+; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s3, s2, 0xb0014
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v3, s4, 0, 13
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s8, v3
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v2
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s4, s5, s4
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s9, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s9, s8
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s8, s5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s3, 0xfc10
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s9, s5
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s3, 12
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s4, s8
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s8, s5, 7
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s8, 5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s8, 3
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s8, s9
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s5, s5, s8
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 31
+; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s8, 0x7e00
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, s8, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s9, s5
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s4, 0x1ff
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s10, s4, 8
+; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, s5, v0
+; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s5, s4, 0xb0014
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s10, s10, 0xffe
+; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s9, 0x3f1, s5
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v1, s9, 0, 13
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s2, s2, s3
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s11, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s9, v0
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s9, s10, s9
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s9, 0x1000
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s12, s10, s11
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s11, s12, s11
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s11, s10
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s5, 0xfc10
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s3, s12, s3
+; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s10, s5, 12
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s9, s10
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s5, 1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s3, s10
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s10, s3, 7
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s10, 5
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s11, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s10, 3
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s10, 1, 0
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s3, s3, 2
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s10, s11
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s3, s3, s10
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s5, 31
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s3, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s9, 0
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, s8, 0x7c00
+; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s5, 0x40f
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s8, s3
+; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s4, s4, 16
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s4, s4, 0x8000
+; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s3, s4, s3
+; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, s2
; GFX11-SDAG-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-SDAG-FAKE16-NEXT: s_endpgm
;
-; GFX11-GISEL-TRUE16-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX11-GISEL-TRUE16-LABEL: fptrunc_v2f64_to_v2f16_afn:
; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry
; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -842,7 +3751,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-GISEL-TRUE16-NEXT: s_endpgm
;
-; GFX11-GISEL-FAKE16-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX11-GISEL-FAKE16-LABEL: fptrunc_v2f64_to_v2f16_afn:
; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry
; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -863,7 +3772,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
ptr addrspace(1) %a) {
entry:
%a.val = load <2 x double>, ptr addrspace(1) %a
- %r.val = fptrunc <2 x double> %a.val to <2 x half>
+ %r.val = fptrunc afn <2 x double> %a.val to <2 x half>
store <2 x half> %r.val, ptr addrspace(1) %r
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 2bd3659..4f8eab1 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -3,17 +3,15 @@
; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-SDAG,VI-SAFE-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-GISEL,VI-SAFE-GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI-SDAG,VI-UNSAFE-SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI-GISEL,VI-UNSAFE-GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-SAFE-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-GISEL,GFX10-SAFE-GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-UNSAFE-SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=1 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX10-GISEL,GFX10-UNSAFE-GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-SAFE-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-SAFE-GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-FAKE16 %s
 
define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) {
; SI-LABEL: fptrunc_f64_to_f32:
@@ -94,6 +92,85 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in)
ret void
}
 
+define amdgpu_kernel void @fptrunc_f64_to_f32_afn(ptr addrspace(1) %out, double %in) {
+; SI-LABEL: fptrunc_f64_to_f32_afn:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-SDAG-LABEL: fptrunc_f64_to_f32_afn:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s6, -1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; VI-SDAG-NEXT: s_mov_b32 s4, s0
+; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_f64_to_f32_afn:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: fptrunc_f64_to_f32_afn:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX10-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: fptrunc_f64_to_f32_afn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: fptrunc_f64_to_f32_afn:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: fptrunc_f64_to_f32_afn:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: s_endpgm
+ %result = fptrunc afn double %in to float
+ store float %result, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) {
; SI-LABEL: fptrunc_f64_to_f16:
; SI: ; %bb.0:
@@ -203,56 +280,56 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-SAFE-SDAG-NEXT: s_endpgm
;
-; VI-SAFE-GISEL-LABEL: fptrunc_f64_to_f16:
-; VI-SAFE-GISEL: ; %bb.0:
-; VI-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
-; VI-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; VI-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
-; VI-SAFE-GISEL-NEXT: s_addk_i32 s4, 0xfc10
-; VI-SAFE-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
-; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s6, s2
-; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2
-; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; VI-SAFE-GISEL-NEXT: s_sub_i32 s7, 1, s4
-; VI-SAFE-GISEL-NEXT: s_lshl_b32 s6, s4, 12
-; VI-SAFE-GISEL-NEXT: s_max_i32 s7, s7, 0
-; VI-SAFE-GISEL-NEXT: s_or_b32 s6, s2, s6
-; VI-SAFE-GISEL-NEXT: s_min_i32 s7, s7, 13
-; VI-SAFE-GISEL-NEXT: s_bitset1_b32 s2, 12
-; VI-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9
-; VI-SAFE-GISEL-NEXT: s_lshr_b32 s8, s2, s7
-; VI-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
-; VI-SAFE-GISEL-NEXT: s_lshl_b32 s7, s8, s7
-; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s7, s2
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s8, s2
-; VI-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, s2, s6
-; VI-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7
-; VI-SAFE-GISEL-NEXT: s_lshr_b32 s2, s2, 2
-; VI-SAFE-GISEL-NEXT: s_cmp_eq_u32 s6, 3
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s7, 1, 0
-; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; VI-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
-; VI-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
-; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
-; VI-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2
-; VI-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16
-; VI-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
-; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s3, s2
-; VI-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-SAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-SAFE-GISEL-NEXT: s_mov_b32 s3, 0xf000
-; VI-SAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
-; VI-SAFE-GISEL-NEXT: s_endpgm
+; VI-GISEL-LABEL: fptrunc_f64_to_f16:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
+; VI-GISEL-NEXT: s_lshr_b32 s5, s3, 8
+; VI-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
+; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10
+; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
+; VI-GISEL-NEXT: s_or_b32 s2, s6, s2
+; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s2, s5, s2
+; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4
+; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12
+; VI-GISEL-NEXT: s_max_i32 s7, s7, 0
+; VI-GISEL-NEXT: s_or_b32 s6, s2, s6
+; VI-GISEL-NEXT: s_min_i32 s7, s7, 13
+; VI-GISEL-NEXT: s_bitset1_b32 s2, 12
+; VI-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; VI-GISEL-NEXT: s_lshr_b32 s8, s2, s7
+; VI-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; VI-GISEL-NEXT: s_lshl_b32 s7, s8, s7
+; VI-GISEL-NEXT: s_cmp_lg_u32 s7, s2
+; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s2, s8, s2
+; VI-GISEL-NEXT: s_cmp_lt_i32 s4, 1
+; VI-GISEL-NEXT: s_cselect_b32 s2, s2, s6
+; VI-GISEL-NEXT: s_and_b32 s6, s2, 7
+; VI-GISEL-NEXT: s_lshr_b32 s2, s2, 2
+; VI-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; VI-GISEL-NEXT: s_cselect_b32 s7, 1, 0
+; VI-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; VI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; VI-GISEL-NEXT: s_or_b32 s6, s7, s6
+; VI-GISEL-NEXT: s_add_i32 s2, s2, s6
+; VI-GISEL-NEXT: s_cmp_gt_i32 s4, 30
+; VI-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; VI-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; VI-GISEL-NEXT: s_cselect_b32 s2, s5, s2
+; VI-GISEL-NEXT: s_lshr_b32 s3, s3, 16
+; VI-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; VI-GISEL-NEXT: s_or_b32 s2, s3, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: s_endpgm
;
; VI-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; VI-UNSAFE-SDAG: ; %bb.0:
@@ -265,17 +342,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-UNSAFE-SDAG-NEXT: s_endpgm
;
-; VI-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16:
-; VI-UNSAFE-GISEL: ; %bb.0:
-; VI-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; VI-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-UNSAFE-GISEL-NEXT: s_mov_b32 s3, 0xf000
-; VI-UNSAFE-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-UNSAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
-; VI-UNSAFE-GISEL-NEXT: s_endpgm
-;
; GFX10-SAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX10-SAFE-SDAG: ; %bb.0:
; GFX10-SAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -328,56 +394,56 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX10-SAFE-SDAG-NEXT: s_endpgm
;
-; GFX10-SAFE-GISEL-LABEL: fptrunc_f64_to_f16:
-; GFX10-SAFE-GISEL: ; %bb.0:
-; GFX10-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
-; GFX10-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s6, s2
-; GFX10-SAFE-GISEL-NEXT: s_addk_i32 s4, 0xfc10
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_sub_i32 s6, 1, s4
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
-; GFX10-SAFE-GISEL-NEXT: s_max_i32 s6, s6, 0
-; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s7, s4, 12
-; GFX10-SAFE-GISEL-NEXT: s_min_i32 s6, s6, 13
-; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s9, s8, s6
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s7
-; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s6, s9, s6
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
-; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s8
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s6, s9, s6
-; GFX10-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, s6, s2
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s2, s2, 2
-; GFX10-SAFE-GISEL-NEXT: s_cmp_eq_u32 s6, 3
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s7, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
-; GFX10-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
-; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
-; GFX10-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s3, s2
-; GFX10-SAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-SAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-SAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
-; GFX10-SAFE-GISEL-NEXT: s_endpgm
+; GFX10-GISEL-LABEL: fptrunc_f64_to_f16:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX10-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
+; GFX10-GISEL-NEXT: s_lshr_b32 s5, s3, 8
+; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX10-GISEL-NEXT: s_addk_i32 s4, 0xfc10
+; GFX10-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GFX10-GISEL-NEXT: s_or_b32 s2, s5, s2
+; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; GFX10-GISEL-NEXT: s_sub_i32 s6, 1, s4
+; GFX10-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
+; GFX10-GISEL-NEXT: s_max_i32 s6, s6, 0
+; GFX10-GISEL-NEXT: s_lshl_b32 s7, s4, 12
+; GFX10-GISEL-NEXT: s_min_i32 s6, s6, 13
+; GFX10-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; GFX10-GISEL-NEXT: s_lshr_b32 s9, s8, s6
+; GFX10-GISEL-NEXT: s_or_b32 s2, s2, s7
+; GFX10-GISEL-NEXT: s_lshl_b32 s6, s9, s6
+; GFX10-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX10-GISEL-NEXT: s_cmp_lg_u32 s6, s8
+; GFX10-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX10-GISEL-NEXT: s_or_b32 s6, s9, s6
+; GFX10-GISEL-NEXT: s_cmp_lt_i32 s4, 1
+; GFX10-GISEL-NEXT: s_cselect_b32 s2, s6, s2
+; GFX10-GISEL-NEXT: s_and_b32 s6, s2, 7
+; GFX10-GISEL-NEXT: s_lshr_b32 s2, s2, 2
+; GFX10-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; GFX10-GISEL-NEXT: s_cselect_b32 s7, 1, 0
+; GFX10-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; GFX10-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX10-GISEL-NEXT: s_or_b32 s6, s7, s6
+; GFX10-GISEL-NEXT: s_add_i32 s2, s2, s6
+; GFX10-GISEL-NEXT: s_cmp_gt_i32 s4, 30
+; GFX10-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; GFX10-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; GFX10-GISEL-NEXT: s_cselect_b32 s2, s5, s2
+; GFX10-GISEL-NEXT: s_lshr_b32 s3, s3, 16
+; GFX10-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX10-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX10-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_endpgm
;
; GFX10-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX10-UNSAFE-SDAG: ; %bb.0:
@@ -390,17 +456,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX10-UNSAFE-SDAG-NEXT: s_endpgm
;
-; GFX10-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16:
-; GFX10-UNSAFE-GISEL: ; %bb.0:
-; GFX10-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX10-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-UNSAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-UNSAFE-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-UNSAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
-; GFX10-UNSAFE-GISEL-NEXT: s_endpgm
-;
; GFX11-SAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX11-SAFE-SDAG: ; %bb.0:
; GFX11-SAFE-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -461,62 +516,368 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-SAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SAFE-SDAG-NEXT: s_endpgm
;
-; GFX11-SAFE-GISEL-LABEL: fptrunc_f64_to_f16:
+; GFX11-GISEL-LABEL: fptrunc_f64_to_f16:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX11-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
+; GFX11-GISEL-NEXT: s_lshr_b32 s5, s3, 8
+; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-NEXT: s_addk_i32 s4, 0xfc10
+; GFX11-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_or_b32 s2, s5, s2
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-GISEL-NEXT: s_sub_i32 s6, 1, s4
+; GFX11-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
+; GFX11-GISEL-NEXT: s_max_i32 s6, s6, 0
+; GFX11-GISEL-NEXT: s_lshl_b32 s7, s4, 12
+; GFX11-GISEL-NEXT: s_min_i32 s6, s6, 13
+; GFX11-GISEL-NEXT: s_lshl_b32 s5, s5, 9
+; GFX11-GISEL-NEXT: s_lshr_b32 s9, s8, s6
+; GFX11-GISEL-NEXT: s_or_b32 s2, s2, s7
+; GFX11-GISEL-NEXT: s_lshl_b32 s6, s9, s6
+; GFX11-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
+; GFX11-GISEL-NEXT: s_cmp_lg_u32 s6, s8
+; GFX11-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_or_b32 s6, s9, s6
+; GFX11-GISEL-NEXT: s_cmp_lt_i32 s4, 1
+; GFX11-GISEL-NEXT: s_cselect_b32 s2, s6, s2
+; GFX11-GISEL-NEXT: s_and_b32 s6, s2, 7
+; GFX11-GISEL-NEXT: s_lshr_b32 s2, s2, 2
+; GFX11-GISEL-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-GISEL-NEXT: s_cselect_b32 s7, 1, 0
+; GFX11-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_or_b32 s6, s7, s6
+; GFX11-GISEL-NEXT: s_add_i32 s2, s2, s6
+; GFX11-GISEL-NEXT: s_cmp_gt_i32 s4, 30
+; GFX11-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; GFX11-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
+; GFX11-GISEL-NEXT: s_cselect_b32 s2, s5, s2
+; GFX11-GISEL-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX11-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX11-UNSAFE-DAG-TRUE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-UNSAFE-DAG-TRUE16: ; %bb.0:
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_mov_b32 s2, -1
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-UNSAFE-DAG-FAKE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-UNSAFE-DAG-FAKE16: ; %bb.0:
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_endpgm
+ %result = fptrunc double %in to half
+ %result_i16 = bitcast half %result to i16
+ store i16 %result_i16, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @fptrunc_f64_to_f16_afn(ptr addrspace(1) %out, double %in) {
+; SI-LABEL: fptrunc_f64_to_f16_afn:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_movk_i32 s2, 0x7e00
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_lshr_b32 s0, s7, 8
+; SI-NEXT: s_and_b32 s1, s7, 0x1ff
+; SI-NEXT: s_and_b32 s8, s0, 0xffe
+; SI-NEXT: s_or_b32 s0, s1, s6
+; SI-NEXT: s_cmp_lg_u32 s0, 0
+; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014
+; SI-NEXT: v_readfirstlane_b32 s1, v0
+; SI-NEXT: s_sub_i32 s6, 0x3f1, s0
+; SI-NEXT: s_or_b32 s1, s8, s1
+; SI-NEXT: v_med3_i32 v0, s6, 0, 13
+; SI-NEXT: s_or_b32 s6, s1, 0x1000
+; SI-NEXT: v_readfirstlane_b32 s8, v0
+; SI-NEXT: s_lshr_b32 s9, s6, s8
+; SI-NEXT: s_lshl_b32 s8, s9, s8
+; SI-NEXT: s_cmp_lg_u32 s8, s6
+; SI-NEXT: s_cselect_b32 s6, 1, 0
+; SI-NEXT: s_addk_i32 s0, 0xfc10
+; SI-NEXT: s_or_b32 s6, s9, s6
+; SI-NEXT: s_lshl_b32 s8, s0, 12
+; SI-NEXT: s_or_b32 s8, s1, s8
+; SI-NEXT: s_cmp_lt_i32 s0, 1
+; SI-NEXT: s_cselect_b32 s6, s6, s8
+; SI-NEXT: s_and_b32 s8, s6, 7
+; SI-NEXT: s_cmp_gt_i32 s8, 5
+; SI-NEXT: s_cselect_b32 s9, 1, 0
+; SI-NEXT: s_cmp_eq_u32 s8, 3
+; SI-NEXT: s_cselect_b32 s8, 1, 0
+; SI-NEXT: s_lshr_b32 s6, s6, 2
+; SI-NEXT: s_or_b32 s8, s8, s9
+; SI-NEXT: s_add_i32 s6, s6, s8
+; SI-NEXT: s_cmp_lt_i32 s0, 31
+; SI-NEXT: s_cselect_b32 s6, s6, 0x7c00
+; SI-NEXT: s_cmp_lg_u32 s1, 0
+; SI-NEXT: s_cselect_b32 s1, s2, 0x7c00
+; SI-NEXT: s_cmpk_eq_i32 s0, 0x40f
+; SI-NEXT: s_cselect_b32 s0, s1, s6
+; SI-NEXT: s_lshr_b32 s1, s7, 16
+; SI-NEXT: s_and_b32 s1, s1, 0x8000
+; SI-NEXT: s_or_b32 s6, s1, s0
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-SAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; VI-SAFE-SDAG: ; %bb.0:
+; VI-SAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
+; VI-SAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SAFE-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SAFE-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SAFE-SDAG-NEXT: s_lshr_b32 s4, s7, 8
+; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s4, 0xffe
+; VI-SAFE-SDAG-NEXT: s_and_b32 s4, s7, 0x1ff
+; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s6
+; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
+; VI-SAFE-SDAG-NEXT: s_mov_b32 s1, s5
+; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0
+; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s4, v0
+; VI-SAFE-SDAG-NEXT: s_bfe_u32 s6, s7, 0xb0014
+; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s8, s4
+; VI-SAFE-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s6
+; VI-SAFE-SDAG-NEXT: v_med3_i32 v0, s8, 0, 13
+; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000
+; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s8, v0
+; VI-SAFE-SDAG-NEXT: s_lshr_b32 s9, s5, s8
+; VI-SAFE-SDAG-NEXT: s_lshl_b32 s8, s9, s8
+; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s8, s5
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0
+; VI-SAFE-SDAG-NEXT: s_addk_i32 s6, 0xfc10
+; VI-SAFE-SDAG-NEXT: s_lshl_b32 s8, s6, 12
+; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s9, s5
+; VI-SAFE-SDAG-NEXT: s_or_b32 s8, s4, s8
+; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s6, 1
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s8
+; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s5, 7
+; VI-SAFE-SDAG-NEXT: s_cmp_gt_i32 s8, 5
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s9, 1, 0
+; VI-SAFE-SDAG-NEXT: s_cmp_eq_u32 s8, 3
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s8, 1, 0
+; VI-SAFE-SDAG-NEXT: s_or_b32 s8, s8, s9
+; VI-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2
+; VI-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s8
+; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s6, 31
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
+; VI-SAFE-SDAG-NEXT: s_movk_i32 s4, 0x7e00
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; VI-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s6, 0x40f
+; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, s5
+; VI-SAFE-SDAG-NEXT: s_lshr_b32 s5, s7, 16
+; VI-SAFE-SDAG-NEXT: s_and_b32 s5, s5, 0x8000
+; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s5, s4
+; VI-SAFE-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-SAFE-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_f64_to_f16_afn:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: s_endpgm
+;
+; VI-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; VI-UNSAFE-SDAG: ; %bb.0:
+; VI-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-UNSAFE-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-UNSAFE-SDAG-NEXT: s_endpgm
+;
+; GFX10-SAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; GFX10-SAFE-SDAG: ; %bb.0:
+; GFX10-SAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s4, s2
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX10-SAFE-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014
+; GFX10-SAFE-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2
+; GFX10-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13
+; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
+; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s7, s5, s6
+; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s6, s7, s6
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, s5
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0
+; GFX10-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s7, s5
+; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s6, s2, 12
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s4, s6
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s6
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7
+; GFX10-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; GFX10-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s6, 1, 0
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7
+; GFX10-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s6
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31
+; GFX10-SAFE-SDAG-NEXT: s_movk_i32 s6, 0x7e00
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s4, s6, 0x7c00
+; GFX10-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, s4, s5
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s3, s3, 16
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s3, s2
+; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-SAFE-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX10-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX10-SAFE-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: fptrunc_f64_to_f16_afn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX10-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; GFX10-UNSAFE-SDAG: ; %bb.0:
+; GFX10-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
+; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX10-UNSAFE-SDAG-NEXT: s_endpgm
+;
+; GFX11-SAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn:
+; GFX11-SAFE-SDAG: ; %bb.0:
+; GFX11-SAFE-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s4, s2
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-SAFE-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014
+; GFX11-SAFE-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13
+; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s7, s5, s6
+; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s7, s6
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, s5
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s7, s5
+; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s2, 12
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s4, s6
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s6
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7
+; GFX11-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s7, 1, 0
+; GFX11-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s6
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31
+; GFX11-SAFE-SDAG-NEXT: s_movk_i32 s6, 0x7e00
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s4, s6, 0x7c00
+; GFX11-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, s4, s5
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s3, s3, 0x8000
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s3, s2
+; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SAFE-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-SAFE-SDAG-NEXT: s_endpgm
+;
+; GFX11-SAFE-GISEL-LABEL: fptrunc_f64_to_f16_afn:
; GFX11-SAFE-GISEL: ; %bb.0:
; GFX11-SAFE-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
-; GFX11-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s6, s2
-; GFX11-SAFE-GISEL-NEXT: s_addk_i32 s4, 0xfc10
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; GFX11-SAFE-GISEL-NEXT: s_sub_i32 s6, 1, s4
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
-; GFX11-SAFE-GISEL-NEXT: s_max_i32 s6, s6, 0
-; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s7, s4, 12
-; GFX11-SAFE-GISEL-NEXT: s_min_i32 s6, s6, 13
-; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s9, s8, s6
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s7
-; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s6, s9, s6
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
-; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s8
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s6, s9, s6
-; GFX11-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, s6, s2
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s2, s2, 2
-; GFX11-SAFE-GISEL-NEXT: s_cmp_eq_u32 s6, 3
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s7, 1, 0
-; GFX11-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
-; GFX11-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
-; GFX11-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
-; GFX11-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16
-; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s3, s2
-; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-GISEL-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-SAFE-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SAFE-GISEL-NEXT: s_endpgm
;
-; GFX11-UNSAFE-DAG-TRUE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-UNSAFE-DAG-TRUE16-LABEL: fptrunc_f64_to_f16_afn:
; GFX11-UNSAFE-DAG-TRUE16: ; %bb.0:
; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -528,7 +889,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-UNSAFE-DAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_endpgm
;
-; GFX11-UNSAFE-DAG-FAKE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-UNSAFE-DAG-FAKE16-LABEL: fptrunc_f64_to_f16_afn:
; GFX11-UNSAFE-DAG-FAKE16: ; %bb.0:
; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -540,7 +901,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-UNSAFE-DAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_endpgm
;
-; GFX11-UNSAFE-GISEL-TRUE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-UNSAFE-GISEL-TRUE16-LABEL: fptrunc_f64_to_f16_afn:
; GFX11-UNSAFE-GISEL-TRUE16: ; %bb.0:
; GFX11-UNSAFE-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-UNSAFE-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -552,7 +913,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-UNSAFE-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-UNSAFE-GISEL-TRUE16-NEXT: s_endpgm
;
-; GFX11-UNSAFE-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-UNSAFE-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16_afn:
; GFX11-UNSAFE-GISEL-FAKE16: ; %bb.0:
; GFX11-UNSAFE-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-UNSAFE-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
@@ -563,7 +924,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-UNSAFE-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-UNSAFE-GISEL-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-UNSAFE-GISEL-FAKE16-NEXT: s_endpgm
- %result = fptrunc double %in to half
+ %result = fptrunc afn double %in to half
%result_i16 = bitcast half %result to i16
store i16 %result_i16, ptr addrspace(1) %out
ret void
@@ -662,6 +1023,99 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do
ret void
}
+define amdgpu_kernel void @fptrunc_v2f64_to_v2f32_afn(ptr addrspace(1) %out, <2 x double> %in) {
+; SI-LABEL: fptrunc_v2f64_to_v2f32_afn:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; SI-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f32_afn:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s6, -1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; VI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f32_afn:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; VI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: fptrunc_v2f64_to_v2f32_afn:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: fptrunc_v2f64_to_v2f32_afn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f32_afn:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: fptrunc_v2f64_to_v2f32_afn:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-GISEL-NEXT: s_endpgm
+ %result = fptrunc afn <2 x double> %in to <2 x float>
+ store <2 x float> %result, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x double> %in) {
; SI-LABEL: fptrunc_v3f64_to_v3f32:
; SI: ; %bb.0:
@@ -769,6 +1223,113 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do
ret void
}
+define amdgpu_kernel void @fptrunc_v3f64_to_v3f32_afn(ptr addrspace(1) %out, <3 x double> %in) {
+; SI-LABEL: fptrunc_v3f64_to_v3f32_afn:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x11
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x15
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; SI-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; SI-NEXT: v_cvt_f32_f64_e32 v2, s[4:5]
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-SDAG-LABEL: fptrunc_v3f64_to_v3f32_afn:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x54
+; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44
+; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[6:7]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s6, -1
+; VI-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; VI-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_v3f64_to_v3f32_afn:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; VI-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; VI-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: fptrunc_v3f64_to_v3f32_afn:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x54
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[6:7]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: fptrunc_v3f64_to_v3f32_afn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX10-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: fptrunc_v3f64_to_v3f32_afn:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x54
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[6:7]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: fptrunc_v3f64_to_v3f32_afn:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b256 s[8:15], s[4:5], 0x44
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX11-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0
+; GFX11-GISEL-NEXT: s_endpgm
+ %result = fptrunc afn <3 x double> %in to <3 x float>
+ store <3 x float> %result, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x double> %in) {
; SI-LABEL: fptrunc_v4f64_to_v4f32:
; SI: ; %bb.0:
@@ -876,6 +1437,113 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do
ret void
}
+define amdgpu_kernel void @fptrunc_v4f64_to_v4f32_afn(ptr addrspace(1) %out, <4 x double> %in) {
+; SI-LABEL: fptrunc_v4f64_to_v4f32_afn:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; SI-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; SI-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; SI-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-SDAG-LABEL: fptrunc_v4f64_to_v4f32_afn:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; VI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_v4f64_to_v4f32_afn:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; VI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: fptrunc_v4f64_to_v4f32_afn:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: fptrunc_v4f64_to_v4f32_afn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: fptrunc_v4f64_to_v4f32_afn:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b256 s[8:15], s[4:5], 0x44
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX11-SDAG-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: fptrunc_v4f64_to_v4f32_afn:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b256 s[8:15], s[4:5], 0x44
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; GFX11-GISEL-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-GISEL-NEXT: s_endpgm
+ %result = fptrunc afn <4 x double> %in to <4 x float>
+ store <4 x float> %result, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x double> %in) {
; SI-LABEL: fptrunc_v8f64_to_v8f32:
; SI: ; %bb.0:
@@ -1019,3 +1687,150 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
store <8 x float> %result, ptr addrspace(1) %out
ret void
}
+
+define amdgpu_kernel void @fptrunc_v8f64_to_v8f32_afn(ptr addrspace(1) %out, <8 x double> %in) {
+; SI-LABEL: fptrunc_v8f64_to_v8f32_afn:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; SI-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; SI-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; SI-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; SI-NEXT: v_cvt_f32_f64_e32 v7, s[22:23]
+; SI-NEXT: v_cvt_f32_f64_e32 v6, s[20:21]
+; SI-NEXT: v_cvt_f32_f64_e32 v5, s[18:19]
+; SI-NEXT: v_cvt_f32_f64_e32 v4, s[16:17]
+; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-SDAG-LABEL: fptrunc_v8f64_to_v8f32_afn:
+; VI-SDAG: ; %bb.0:
+; VI-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[22:23]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[20:21]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v5, s[18:19]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v4, s[16:17]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; VI-SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-SDAG-NEXT: s_endpgm
+;
+; VI-GISEL-LABEL: fptrunc_v8f64_to_v8f32_afn:
+; VI-GISEL: ; %bb.0:
+; VI-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v4, s[16:17]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[18:19]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[20:21]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[22:23]
+; VI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: fptrunc_v8f64_to_v8f32_afn:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[22:23]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[20:21]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v5, s[18:19]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v4, s[16:17]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: fptrunc_v8f64_to_v8f32_afn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v4, s[16:17]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[18:19]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[20:21]
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[22:23]
+; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: fptrunc_v8f64_to_v8f32_afn:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0x64
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[22:23]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[20:21]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v5, s[18:19]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v4, s[16:17]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16
+; GFX11-SDAG-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: fptrunc_v8f64_to_v8f32_afn:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0x64
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v4, s[16:17]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[18:19]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[20:21]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[22:23]
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16
+; GFX11-GISEL-NEXT: s_endpgm
+  %result = fptrunc afn <8 x double> %in to <8 x float>
+ store <8 x float> %result, ptr addrspace(1) %out
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10-SAFE-GISEL: {{.*}}
+; VI-SAFE-GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
new file mode 100644
index 0000000..d13d76f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
@@ -0,0 +1,100 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O3 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O3 -mcpu=gfx1250 -mattr=-cu-stores < %s | FileCheck --check-prefixes=GCN,NOCU %s
+
+; Check that with -mattr=-cu-stores, a minimum scope of SCOPE_SE is used on all stores.
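+; The .amdhsa_uses_cu_stores kernel descriptor directive checked below records
+; whether the cu-stores feature is enabled (1) or disabled (0).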
+
+; GCN: flat_store:
+; CU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; NOCU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; GCN: .amdhsa_kernel flat_store
+; CU: .amdhsa_uses_cu_stores 1
+; NOCU: .amdhsa_uses_cu_stores 0
+define amdgpu_kernel void @flat_store(ptr %dst, i32 %val) {
+entry:
+ store i32 %val, ptr %dst
+ ret void
+}
+
+; GCN: global_store:
+; CU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}}{{$}}
+; NOCU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; GCN: .amdhsa_kernel global_store
+; CU: .amdhsa_uses_cu_stores 1
+; NOCU: .amdhsa_uses_cu_stores 0
+define amdgpu_kernel void @global_store(ptr addrspace(1) %dst, i32 %val) {
+entry:
+ store i32 %val, ptr addrspace(1) %dst
+ ret void
+}
+
+; GCN: local_store:
+; CU: ds_store_b32 v{{.*}}, v{{.*}}{{$}}
+; NOCU: ds_store_b32 v{{.*}}, v{{.*}}{{$}}
+; GCN: .amdhsa_kernel local_store
+; CU: .amdhsa_uses_cu_stores 1
+; NOCU: .amdhsa_uses_cu_stores 0
+define amdgpu_kernel void @local_store(ptr addrspace(3) %dst, i32 %val) {
+entry:
+ store i32 %val, ptr addrspace(3) %dst
+ ret void
+}
+
+; GCN: scratch_store:
+; CU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; NOCU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; GCN: .amdhsa_kernel scratch_store
+; CU: .amdhsa_uses_cu_stores 1
+; NOCU: .amdhsa_uses_cu_stores 0
+define amdgpu_kernel void @scratch_store(ptr addrspace(5) %dst, i32 %val) {
+entry:
+ store i32 %val, ptr addrspace(5) %dst
+ ret void
+}
+
+; GCN: flat_atomic_store:
+; CU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; NOCU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; GCN: .amdhsa_kernel flat_atomic_store
+; CU: .amdhsa_uses_cu_stores 1
+; NOCU: .amdhsa_uses_cu_stores 0
+define amdgpu_kernel void @flat_atomic_store(ptr %dst, i32 %val) {
+entry:
+ store atomic i32 %val, ptr %dst syncscope("wavefront") unordered, align 4
+ ret void
+}
+
+; GCN: global_atomic_store:
+; CU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}}{{$}}
+; NOCU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; GCN: .amdhsa_kernel global_atomic_store
+; CU: .amdhsa_uses_cu_stores 1
+; NOCU: .amdhsa_uses_cu_stores 0
+define amdgpu_kernel void @global_atomic_store(ptr addrspace(1) %dst, i32 %val) {
+entry:
+ store atomic i32 %val, ptr addrspace(1) %dst syncscope("wavefront") unordered, align 4
+ ret void
+}
+
+; GCN: local_atomic_store:
+; CU: ds_store_b32 v{{.*}}, v{{.*}}{{$}}
+; NOCU: ds_store_b32 v{{.*}}, v{{.*}}{{$}}
+; GCN: .amdhsa_kernel local_atomic_store
+; CU: .amdhsa_uses_cu_stores 1
+; NOCU: .amdhsa_uses_cu_stores 0
+define amdgpu_kernel void @local_atomic_store(ptr addrspace(3) %dst, i32 %val) {
+entry:
+ store atomic i32 %val, ptr addrspace(3) %dst syncscope("wavefront") unordered, align 4
+ ret void
+}
+
+; GCN: scratch_atomic_store:
+; CU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; NOCU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
+; GCN: .amdhsa_kernel scratch_atomic_store
+; CU: .amdhsa_uses_cu_stores 1
+; NOCU: .amdhsa_uses_cu_stores 0
+define amdgpu_kernel void @scratch_atomic_store(ptr addrspace(5) %dst, i32 %val) {
+entry:
+ store atomic i32 %val, ptr addrspace(5) %dst syncscope("wavefront") unordered, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll b/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll
new file mode 100644
index 0000000..d1e82a0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-GISEL %s
+
+; Test that stores that may hit scratch are correctly promoted to SCOPE_SE.
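+; Stores are left at the default scope only when they provably cannot access
+; scratch (see test_flat_store_no_scratch_alloc below).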
+
+define void @test_scratch_store(ptr addrspace(5) %ptr, i32 %val) {
+; GCN-LABEL: test_scratch_store:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SE
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ store i32 %val, ptr addrspace(5) %ptr
+ ret void
+}
+
+define void @test_unknown_flat_store(ptr %ptr, i32 %val) {
+; GCN-LABEL: test_unknown_flat_store:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GCN-NEXT: s_wait_dscnt 0x0
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ store i32 %val, ptr %ptr
+ ret void
+}
+
+define void @test_flat_store_no_scratch_alloc(ptr %ptr, i32 %val) #0 {
+; GCN-LABEL: test_flat_store_no_scratch_alloc:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: flat_store_b32 v[0:1], v2
+; GCN-NEXT: s_wait_dscnt 0x0
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ store i32 %val, ptr %ptr
+ ret void
+}
+
+; TODO: handle !noalias.addrspace; the metadata excludes the scratch address
+; space, so SCOPE_SE should not be required here.
+define void @test_flat_store_noalias_addrspace(ptr %ptr, i32 %val) {
+; GCN-LABEL: test_flat_store_noalias_addrspace:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GCN-NEXT: s_wait_dscnt 0x0
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+ store i32 %val, ptr %ptr, !noalias.addrspace !{i32 5, i32 6}
+ ret void
+}
+
+; TODO: would be nice to handle this too; neither select operand can point to
+; scratch, so SCOPE_SE is unnecessary.
+define void @test_flat_store_select(ptr addrspace(1) %a, ptr addrspace(3) %b, i1 %cond, i32 %val) {
+; GCN-SDAG-LABEL: test_flat_store_select:
+; GCN-SDAG: ; %bb.0:
+; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
+; GCN-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v2
+; GCN-SDAG-NEXT: v_and_b32_e32 v3, 1, v3
+; GCN-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo
+; GCN-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, s1, vcc_lo
+; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GCN-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GCN-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v2, v0
+; GCN-SDAG-NEXT: flat_store_b32 v[0:1], v4 scope:SCOPE_SE
+; GCN-SDAG-NEXT: s_wait_dscnt 0x0
+; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GCN-GISEL-LABEL: test_flat_store_select:
+; GCN-GISEL: ; %bb.0:
+; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
+; GCN-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v2
+; GCN-GISEL-NEXT: v_and_b32_e32 v3, 1, v3
+; GCN-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GCN-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo
+; GCN-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, s1, vcc_lo
+; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GCN-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GCN-GISEL-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v5, v1
+; GCN-GISEL-NEXT: flat_store_b32 v[0:1], v4 scope:SCOPE_SE
+; GCN-GISEL-NEXT: s_wait_dscnt 0x0
+; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31]
+ %a.ascast = addrspacecast ptr addrspace(1) %a to ptr
+ %b.ascast = addrspacecast ptr addrspace(3) %b to ptr
+ %ptr = select i1 %cond, ptr %a.ascast, ptr %b.ascast
+ store i32 %val, ptr %ptr
+ ret void
+}
+
+attributes #0 = { "amdgpu-no-flat-scratch-init" }
diff --git a/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll b/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll
index 99690e4..fe8edd5 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll
@@ -1,4 +1,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX908 %s
+
+; Make sure the -amdgpu-mfma-vgpr-form flag is ignored on gfx908, whose MFMAs can only write AGPRs.
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-mfma-vgpr-form=1 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX908 %s
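+; (Reusing the GFX908 check prefixes from the run above verifies the output is unchanged.)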
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX90A %s
; GFX9-DAG: buffer_load_format_xyzw v[{{[0-9:]+}}], v{{[0-9]+}}, s[{{[0-9:]+}}], 0 idxen ; encoding:
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
index fd644a3..3a898a9 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
@@ -124,27 +124,27 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
; GCN-SDAG-NEXT: s_clause 0xd
-; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:52
-; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:48
-; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:44
-; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 offset:40
-; GCN-SDAG-NEXT: scratch_store_b32 off, v44, s32 offset:36
-; GCN-SDAG-NEXT: scratch_store_b32 off, v45, s32 offset:32
-; GCN-SDAG-NEXT: scratch_store_b32 off, v56, s32 offset:28
-; GCN-SDAG-NEXT: scratch_store_b32 off, v57, s32 offset:24
-; GCN-SDAG-NEXT: scratch_store_b32 off, v58, s32 offset:20
-; GCN-SDAG-NEXT: scratch_store_b32 off, v59, s32 offset:16
-; GCN-SDAG-NEXT: scratch_store_b32 off, v60, s32 offset:12
-; GCN-SDAG-NEXT: scratch_store_b32 off, v61, s32 offset:8
-; GCN-SDAG-NEXT: scratch_store_b32 off, v62, s32 offset:4
-; GCN-SDAG-NEXT: scratch_store_b32 off, v63, s32
+; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:52 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:48 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:44 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 offset:40 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v44, s32 offset:36 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v45, s32 offset:32 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v56, s32 offset:28 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v57, s32 offset:24 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v58, s32 offset:20 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v59, s32 offset:16 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v60, s32 offset:12 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v61, s32 offset:8 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v62, s32 offset:4 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v63, s32 scope:SCOPE_SE
; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:224
; GCN-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:56 ; 16-byte Folded Spill
+; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:56 scope:SCOPE_SE ; 16-byte Folded Spill
; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:240
; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:72 ; 16-byte Folded Spill
+; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:72 scope:SCOPE_SE ; 16-byte Folded Spill
; GCN-SDAG-NEXT: s_clause 0xd
; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:192
; GCN-SDAG-NEXT: global_load_b128 v[14:17], v[0:1], off offset:208
@@ -206,27 +206,27 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
; GCN-GISEL-NEXT: s_clause 0xf
-; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:60
-; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:56
-; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:52
-; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:48
-; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:44
-; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 offset:40
-; GCN-GISEL-NEXT: scratch_store_b32 off, v46, s32 offset:36
-; GCN-GISEL-NEXT: scratch_store_b32 off, v47, s32 offset:32
-; GCN-GISEL-NEXT: scratch_store_b32 off, v56, s32 offset:28
-; GCN-GISEL-NEXT: scratch_store_b32 off, v57, s32 offset:24
-; GCN-GISEL-NEXT: scratch_store_b32 off, v58, s32 offset:20
-; GCN-GISEL-NEXT: scratch_store_b32 off, v59, s32 offset:16
-; GCN-GISEL-NEXT: scratch_store_b32 off, v60, s32 offset:12
-; GCN-GISEL-NEXT: scratch_store_b32 off, v61, s32 offset:8
-; GCN-GISEL-NEXT: scratch_store_b32 off, v62, s32 offset:4
-; GCN-GISEL-NEXT: scratch_store_b32 off, v63, s32
+; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:60 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:56 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:52 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:48 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:44 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 offset:40 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v46, s32 offset:36 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v47, s32 offset:32 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v56, s32 offset:28 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v57, s32 offset:24 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v58, s32 offset:20 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v59, s32 offset:16 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v60, s32 offset:12 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v61, s32 offset:8 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v62, s32 offset:4 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v63, s32 scope:SCOPE_SE
; GCN-GISEL-NEXT: s_wait_xcnt 0x8
; GCN-GISEL-NEXT: v_dual_mov_b32 v46, v3 :: v_dual_mov_b32 v47, v4
; GCN-GISEL-NEXT: global_load_b128 v[2:5], v[0:1], off offset:32
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: scratch_store_b128 off, v[2:5], s32 offset:80 ; 16-byte Folded Spill
+; GCN-GISEL-NEXT: scratch_store_b128 off, v[2:5], s32 offset:80 scope:SCOPE_SE ; 16-byte Folded Spill
; GCN-GISEL-NEXT: s_clause 0xe
; GCN-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:48
; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off offset:64
@@ -244,7 +244,7 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
; GCN-GISEL-NEXT: global_load_b128 v[60:63], v[0:1], off offset:16
; GCN-GISEL-NEXT: global_load_b128 v[0:3], v[0:1], off offset:240
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:64 ; 16-byte Folded Spill
+; GCN-GISEL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:64 scope:SCOPE_SE ; 16-byte Folded Spill
; GCN-GISEL-NEXT: scratch_load_b128 v[0:3], off, s32 offset:80 th:TH_LOAD_LU ; 16-byte Folded Reload
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
; GCN-GISEL-NEXT: s_clause 0xe
@@ -299,10 +299,10 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
; GCN-SDAG-NEXT: s_clause 0x3
-; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:12
-; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:8
-; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:4
-; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32
+; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:12 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:8 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:4 scope:SCOPE_SE
+; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 scope:SCOPE_SE
; GCN-SDAG-NEXT: s_clause 0x7
; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:112
; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:96
@@ -385,12 +385,12 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
; GCN-GISEL-NEXT: s_clause 0x5
-; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:20
-; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:16
-; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:12
-; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:8
-; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:4
-; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32
+; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:20 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:16 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:12 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:8 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:4 scope:SCOPE_SE
+; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 scope:SCOPE_SE
; GCN-GISEL-NEXT: s_clause 0x7
; GCN-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:80
; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off
diff --git a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
index 258aa9e..0a493e51 100644
--- a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
@@ -8,15 +8,15 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) {
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_ashr_i32 s7, s6, 31
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-NEXT: v_mov_b32_e32 v3, s3
; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3
; CHECK-NEXT: s_add_u32 s0, s2, s0
; CHECK-NEXT: s_addc_u32 s1, s3, s1
-; CHECK-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-NEXT: v_add_co_u32_e64 v2, vcc, -8, s0
-; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
-; CHECK-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -8, s0
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_wbinvl1_vol
; CHECK-NEXT: s_endpgm
@@ -35,15 +35,15 @@ define protected amdgpu_kernel void @InferFadd(i32 %a, ptr addrspace(1) %b, doub
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_ashr_i32 s7, s6, 31
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-NEXT: v_mov_b32_e32 v3, s3
; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3
; CHECK-NEXT: s_add_u32 s0, s0, s2
; CHECK-NEXT: s_addc_u32 s1, s1, s3
-; CHECK-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-NEXT: v_add_co_u32_e64 v2, vcc, -8, s0
-; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
-; CHECK-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -8, s0
+; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_wbinvl1_vol
; CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir
index 11de6c8..06c3da0 100644
--- a/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir
@@ -32,32 +32,14 @@
# CHECK-NEXT: undef [[SPLIT0:%[0-9]+]].sub2_sub3:av_512_align2 = COPY undef $vgpr2_vgpr3 {
# CHECK-NEXT: internal [[SPLIT0]].sub0:av_512_align2 = COPY undef $vgpr0
# CHECK-NEXT: }
-# CHECK-NEXT: undef [[SPLIT1:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT0]].sub2_sub3 {
-# CHECK-NEXT: internal [[SPLIT1]].sub0:av_512_align2 = COPY [[SPLIT0]].sub0
-# CHECK-NEXT: }
-# CHECK-NEXT: undef [[SPLIT2:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT1]].sub2_sub3 {
-# CHECK-NEXT: internal [[SPLIT2]].sub0:av_512_align2 = COPY [[SPLIT1]].sub0
-# CHECK-NEXT: }
-# CHECK-NEXT: SI_SPILL_AV512_SAVE [[SPLIT2]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.1, align 4, addrspace 5)
-# CHECK-NEXT: [[RESTORE1:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
-# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub0_sub1:av_512_align2 = COPY [[RESTORE1]].sub0_sub1
-# CHECK-NEXT: [[RESTORE2:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.1, align 4, addrspace 5)
-# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[RESTORE2]].sub2_sub3 {
-# CHECK-NEXT: internal [[SPLIT3]].sub0:av_512_align2 = COPY [[RESTORE2]].sub0
-# CHECK-NEXT: }
-# CHECK-NEXT: undef [[SPLIT4:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT3]].sub2_sub3 {
-# CHECK-NEXT: internal [[SPLIT4]].sub0:av_512_align2 = COPY [[SPLIT3]].sub0
-# CHECK-NEXT: }
-# CHECK-NEXT: [[SPLIT5:%[0-9]+]].sub2:av_512_align2 = COPY [[SPLIT4]].sub3
-# CHECK-NEXT: undef [[SPLIT6:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT5]].sub0_sub1_sub2
-# CHECK-NEXT: undef [[SPLIT7:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT6]].sub0_sub1_sub2
-# CHECK-NEXT: undef [[SPLIT8:%[0-9]+]].sub0:av_512_align2 = COPY [[SPLIT4]].sub0 {
-# CHECK-NEXT: internal [[SPLIT8]].sub2:av_512_align2 = COPY [[SPLIT4]].sub2
+# CHECK-NEXT: undef [[SPLIT2:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT0]].sub2_sub3 {
+# CHECK-NEXT: internal [[SPLIT2]].sub0:av_512_align2 = COPY [[SPLIT0]].sub0
# CHECK-NEXT: }
-# CHECK-NEXT: [[SPLIT9:%[0-9]+]].sub3:av_512_align2 = COPY [[SPLIT8]].sub2
-# CHECK-NEXT: undef [[SPLIT10:%[0-9]+]].sub0_sub1_sub2_sub3:av_512_align2 = COPY [[SPLIT9]].sub0_sub1_sub2_sub3
-# CHECK-NEXT: undef [[SPLIT13:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_512_align2 = COPY [[SPLIT10]].sub0_sub1_sub2_sub3
-# CHECK-NEXT: [[MFMA_USE1:%[0-9]+]].sub4:vreg_512_align2 = COPY [[SPLIT8]].sub0
+# CHECK-NEXT: [[RESTORE2:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
+# CHECK-NEXT: [[MFMA_USE1:%[0-9]+]].sub0_sub1:vreg_512_align2 = COPY [[RESTORE2]].sub0_sub1
+# CHECK-NEXT: [[MFMA_USE1]].sub2:vreg_512_align2 = COPY [[SPLIT2]].sub3
+# CHECK-NEXT: [[MFMA_USE1]].sub3:vreg_512_align2 = COPY [[SPLIT2]].sub2
+# CHECK-NEXT: [[MFMA_USE1]].sub4:vreg_512_align2 = COPY [[SPLIT2]].sub0
# CHECK-NEXT: [[MFMA_USE1]].sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[MFMA_USE1]].sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[MFMA_USE1]].sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
index 1ac75d3..d8c983a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
@@ -1331,16 +1331,16 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX942-NEXT: s_load_dword s6, s[4:5], 0x10
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v8, 5, v0
-; GFX942-NEXT: v_mov_b32_e32 v9, 0x5040100
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 5, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0x5040100
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3]
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v4, s[2:3] offset:16
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_perm_b32 v1, s6, v1, v9
+; GFX942-NEXT: v_perm_b32 v1, s6, v1, v5
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
diff --git a/llvm/test/CodeGen/AMDGPU/large-avgpr-assign-last.mir b/llvm/test/CodeGen/AMDGPU/large-avgpr-assign-last.mir
new file mode 100644
index 0000000..58e9b0a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/large-avgpr-assign-last.mir
@@ -0,0 +1,94 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-regalloc -greedy-regclass-priority-trumps-globalness=1 -start-after=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck %s
+
+--- |
+ define void @temp_vgpr_to_agpr_should_not_undo_split_with_remat() #0 {
+ entry:
+ unreachable
+ }
+
+ attributes #0 = { "amdgpu-agpr-alloc"="0,0" }
+...
+
+
+---
+name: temp_vgpr_to_agpr_should_not_undo_split_with_remat
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ stackPtrOffsetReg: '$sgpr32'
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ workGroupIDX: { reg: '$sgpr6' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+ workItemIDX: { reg: '$vgpr0' }
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr4_sgpr5
+ ; CHECK-LABEL: name: temp_vgpr_to_agpr_should_not_undo_split_with_remat
+ ; CHECK: liveins: $vgpr0, $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+ ; CHECK-NEXT: dead renamable $vgpr1 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr1 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr2 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr3 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr4 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr5 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr6 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr7 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr8 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr9 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr10 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr11 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr12 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr13 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr14 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr15 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr16 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr17 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr18 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr19 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr20 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr21 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $vgpr22 = IMPLICIT_DEF
+ ; CHECK-NEXT: KILL killed renamable $vgpr2, killed renamable $vgpr3, killed renamable $vgpr4, killed renamable $vgpr5, killed renamable $vgpr6, killed renamable $vgpr7, killed renamable $vgpr8, killed renamable $vgpr9, killed renamable $vgpr10, killed renamable $vgpr11, killed renamable $vgpr12, killed renamable $vgpr13, killed renamable $vgpr14, killed renamable $vgpr15, killed renamable $vgpr16
+ ; CHECK-NEXT: S_NOP 0, implicit-def renamable $vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38
+ ; CHECK-NEXT: S_NOP 0, implicit-def renamable $vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54
+ ; CHECK-NEXT: KILL killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $vgpr17, killed renamable $vgpr18, killed renamable $vgpr19, killed renamable $vgpr20, killed renamable $vgpr21, killed renamable $vgpr22
+ ; CHECK-NEXT: S_NOP 0, implicit killed renamable $vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, implicit killed renamable $vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54
+ ; CHECK-NEXT: S_ENDPGM 0
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = IMPLICIT_DEF
+ %4:vgpr_32 = IMPLICIT_DEF
+ %5:vgpr_32 = IMPLICIT_DEF
+ %6:vgpr_32 = IMPLICIT_DEF
+ %7:vgpr_32 = IMPLICIT_DEF
+ %8:vgpr_32 = IMPLICIT_DEF
+ %9:vgpr_32 = IMPLICIT_DEF
+ %10:vgpr_32 = IMPLICIT_DEF
+ %11:vgpr_32 = IMPLICIT_DEF
+ %12:vgpr_32 = IMPLICIT_DEF
+ %13:vgpr_32 = IMPLICIT_DEF
+ %14:vgpr_32 = IMPLICIT_DEF
+ %15:vgpr_32 = IMPLICIT_DEF
+ %16:vgpr_32 = IMPLICIT_DEF
+ %17:vgpr_32 = IMPLICIT_DEF
+ %18:vgpr_32 = IMPLICIT_DEF
+ %19:vgpr_32 = IMPLICIT_DEF
+ %20:vgpr_32 = IMPLICIT_DEF
+ %21:vgpr_32 = IMPLICIT_DEF
+ %22:vgpr_32 = IMPLICIT_DEF
+ %23:vgpr_32 = IMPLICIT_DEF
+ KILL %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, %16, %17
+ S_NOP 0, implicit-def %50:av_512
+ S_NOP 0, implicit-def %51:av_512
+ KILL %1, %2, %18, %19, %20, %21, %22, %23
+ S_NOP 0, implicit %50, implicit %51
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll
index 85dd275..fcdad53 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll
@@ -4,30 +4,30 @@
define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(<1 x i64> %L1) {
; GCN-LABEL: test_iglp_opt_rev_mfma_gemm:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: v_mov_b32_e32 v32, 0
-; GCN-NEXT: ds_read_b128 v[0:3], v32
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: ds_read_b128 v[2:5], v0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GCN-NEXT: ds_read_b128 v[28:31], v32 offset:112
-; GCN-NEXT: ds_read_b128 v[24:27], v32 offset:96
-; GCN-NEXT: ds_read_b128 v[20:23], v32 offset:80
-; GCN-NEXT: ds_read_b128 v[16:19], v32 offset:64
-; GCN-NEXT: ds_read_b128 v[4:7], v32 offset:16
-; GCN-NEXT: ds_read_b128 v[8:11], v32 offset:32
-; GCN-NEXT: ds_read_b128 v[12:15], v32 offset:48
+; GCN-NEXT: ds_read_b128 v[30:33], v0 offset:112
+; GCN-NEXT: ds_read_b128 v[26:29], v0 offset:96
+; GCN-NEXT: ds_read_b128 v[22:25], v0 offset:80
+; GCN-NEXT: ds_read_b128 v[18:21], v0 offset:64
+; GCN-NEXT: ds_read_b128 v[6:9], v0 offset:16
+; GCN-NEXT: ds_read_b128 v[10:13], v0 offset:32
+; GCN-NEXT: ds_read_b128 v[14:17], v0 offset:48
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: ds_write_b128 v32, v[0:3]
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: ds_write_b128 v0, v[2:5]
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: v_mov_b32_e32 v3, v2
; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0
; GCN-NEXT: ; iglp_opt mask(0x00000001)
-; GCN-NEXT: ds_write_b128 v32, v[28:31] offset:112
-; GCN-NEXT: ds_write_b128 v32, v[24:27] offset:96
-; GCN-NEXT: ds_write_b128 v32, v[20:23] offset:80
-; GCN-NEXT: ds_write_b128 v32, v[16:19] offset:64
-; GCN-NEXT: ds_write_b128 v32, v[12:15] offset:48
-; GCN-NEXT: ds_write_b128 v32, v[8:11] offset:32
-; GCN-NEXT: ds_write_b128 v32, v[4:7] offset:16
-; GCN-NEXT: ds_write_b64 v32, v[0:1]
+; GCN-NEXT: ds_write_b128 v0, v[30:33] offset:112
+; GCN-NEXT: ds_write_b128 v0, v[26:29] offset:96
+; GCN-NEXT: ds_write_b128 v0, v[22:25] offset:80
+; GCN-NEXT: ds_write_b128 v0, v[18:21] offset:64
+; GCN-NEXT: ds_write_b128 v0, v[14:17] offset:48
+; GCN-NEXT: ds_write_b128 v0, v[10:13] offset:32
+; GCN-NEXT: ds_write_b128 v0, v[6:9] offset:16
+; GCN-NEXT: ds_write_b64 v0, v[2:3]
; GCN-NEXT: s_endpgm
entry:
call void @llvm.amdgcn.iglp.opt(i32 1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
index ed7d88b..dcac419 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll
@@ -18,19 +18,22 @@ define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1
; GCN-LABEL: load_1d_lwe:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: v_mov_b32_e32 v8, 0
-; GCN-NEXT: v_mov_b32_e32 v6, v0
; GCN-NEXT: v_mov_b32_e32 v9, v8
; GCN-NEXT: v_mov_b32_e32 v10, v8
; GCN-NEXT: v_mov_b32_e32 v11, v8
; GCN-NEXT: v_mov_b32_e32 v12, v8
-; GCN-NEXT: v_mov_b32_e32 v0, v8
-; GCN-NEXT: v_mov_b32_e32 v1, v9
-; GCN-NEXT: v_mov_b32_e32 v2, v10
-; GCN-NEXT: v_mov_b32_e32 v3, v11
-; GCN-NEXT: v_mov_b32_e32 v4, v12
-; GCN-NEXT: image_load v[0:4], v6, s[0:7] dmask:0xf unorm lwe
+; GCN-NEXT: v_mov_b32_e32 v2, v8
+; GCN-NEXT: v_mov_b32_e32 v3, v9
+; GCN-NEXT: v_mov_b32_e32 v4, v10
+; GCN-NEXT: v_mov_b32_e32 v5, v11
+; GCN-NEXT: v_mov_b32_e32 v6, v12
+; GCN-NEXT: image_load v[2:6], v0, s[0:7] dmask:0xf unorm lwe
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dword v8, v4, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: v_mov_b32_e32 v2, v4
+; GCN-NEXT: v_mov_b32_e32 v3, v5
+; GCN-NEXT: global_store_dword v8, v6, s[8:9]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
@@ -75,6 +78,27 @@ main_body:
}
define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice) {
+; GCN-LABEL: load_cube_lwe:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: v_mov_b32_e32 v10, 0
+; GCN-NEXT: v_mov_b32_e32 v11, v10
+; GCN-NEXT: v_mov_b32_e32 v12, v10
+; GCN-NEXT: v_mov_b32_e32 v13, v10
+; GCN-NEXT: v_mov_b32_e32 v14, v10
+; GCN-NEXT: v_mov_b32_e32 v4, v10
+; GCN-NEXT: v_mov_b32_e32 v5, v11
+; GCN-NEXT: v_mov_b32_e32 v6, v12
+; GCN-NEXT: v_mov_b32_e32 v7, v13
+; GCN-NEXT: v_mov_b32_e32 v8, v14
+; GCN-NEXT: image_load v[4:8], v[0:2], s[0:7] dmask:0xf unorm lwe da
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, v4
+; GCN-NEXT: v_mov_b32_e32 v1, v5
+; GCN-NEXT: v_mov_b32_e32 v2, v6
+; GCN-NEXT: v_mov_b32_e32 v3, v7
+; GCN-NEXT: global_store_dword v10, v8, s[8:9]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
@@ -106,6 +130,27 @@ main_body:
}
define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice) {
+; GCN-LABEL: load_2darray_lwe:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: v_mov_b32_e32 v10, 0
+; GCN-NEXT: v_mov_b32_e32 v11, v10
+; GCN-NEXT: v_mov_b32_e32 v12, v10
+; GCN-NEXT: v_mov_b32_e32 v13, v10
+; GCN-NEXT: v_mov_b32_e32 v14, v10
+; GCN-NEXT: v_mov_b32_e32 v4, v10
+; GCN-NEXT: v_mov_b32_e32 v5, v11
+; GCN-NEXT: v_mov_b32_e32 v6, v12
+; GCN-NEXT: v_mov_b32_e32 v7, v13
+; GCN-NEXT: v_mov_b32_e32 v8, v14
+; GCN-NEXT: image_load v[4:8], v[0:2], s[0:7] dmask:0xf unorm lwe da
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, v4
+; GCN-NEXT: v_mov_b32_e32 v1, v5
+; GCN-NEXT: v_mov_b32_e32 v2, v6
+; GCN-NEXT: v_mov_b32_e32 v3, v7
+; GCN-NEXT: global_store_dword v10, v8, s[8:9]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
index 1585a2c..303ea50 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
@@ -1,6 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck --check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
@@ -9,50 +10,199 @@ declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16>, <2 x i16>, <16
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
declare i32 @llvm.amdgcn.workitem.id.x()
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x2bf16:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN-DAG: s_load_dwordx16
-; GCN-DAG: s_load_dwordx16
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX90A-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GCN: v_mfma_f32_32x32x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX908-COUNT-32: v_accvgpr_read_b32
-; GFX908: global_store_dwordx4
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}],
define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_f32_32x32x2bf16:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX908-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v0, s16
+; GFX908-NEXT: v_mov_b32_e32 v1, s17
+; GFX908-NEXT: v_mov_b32_e32 v2, s18
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s21
+; GFX908-NEXT: v_mov_b32_e32 v1, s22
+; GFX908-NEXT: v_mov_b32_e32 v2, s23
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s24
+; GFX908-NEXT: v_mov_b32_e32 v1, s25
+; GFX908-NEXT: v_mov_b32_e32 v2, s26
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s27
+; GFX908-NEXT: v_mov_b32_e32 v1, s28
+; GFX908-NEXT: v_mov_b32_e32 v2, s29
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s30
+; GFX908-NEXT: v_mov_b32_e32 v1, s31
+; GFX908-NEXT: v_mov_b32_e32 v2, s0
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s1
+; GFX908-NEXT: v_mov_b32_e32 v1, s2
+; GFX908-NEXT: v_mov_b32_e32 v2, s3
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s4
+; GFX908-NEXT: v_mov_b32_e32 v1, s5
+; GFX908-NEXT: v_mov_b32_e32 v2, s6
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s7
+; GFX908-NEXT: v_mov_b32_e32 v1, s8
+; GFX908-NEXT: v_mov_b32_e32 v2, s9
+; GFX908-NEXT: v_mov_b32_e32 v3, s19
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s10
+; GFX908-NEXT: v_mov_b32_e32 v1, s11
+; GFX908-NEXT: v_mov_b32_e32 v2, s12
+; GFX908-NEXT: v_mov_b32_e32 v5, s20
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s13
+; GFX908-NEXT: v_mov_b32_e32 v1, s14
+; GFX908-NEXT: v_mov_b32_e32 v2, s15
+; GFX908-NEXT: v_mov_b32_e32 v3, 1
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, 2
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_f32_32x32x2bf16:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1
+; GFX90A-NEXT: v_mov_b32_e32 v2, 2
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
+; GFX90A-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%a = bitcast i32 1 to <2 x i16>
@@ -62,18 +212,109 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_16x16x2bf16:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN-DAG: s_load_dwordx16
-; GFX908-DAG-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX90A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GCN: v_mfma_f32_16x16x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX908-COUNT-16: v_accvgpr_read_b32
-; GFX908: global_store_dwordx4
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX90A-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}],
define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_f32_16x16x2bf16:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v0, 1
+; GFX908-NEXT: v_mov_b32_e32 v12, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v13, s0
+; GFX908-NEXT: v_mov_b32_e32 v1, s1
+; GFX908-NEXT: v_mov_b32_e32 v2, s2
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v13
+; GFX908-NEXT: v_mov_b32_e32 v13, s3
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v13
+; GFX908-NEXT: v_mov_b32_e32 v1, s4
+; GFX908-NEXT: v_mov_b32_e32 v2, s5
+; GFX908-NEXT: v_mov_b32_e32 v13, s6
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v13
+; GFX908-NEXT: v_mov_b32_e32 v1, s7
+; GFX908-NEXT: v_mov_b32_e32 v2, s8
+; GFX908-NEXT: v_mov_b32_e32 v13, s9
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v13
+; GFX908-NEXT: v_mov_b32_e32 v1, s10
+; GFX908-NEXT: v_mov_b32_e32 v2, s11
+; GFX908-NEXT: v_mov_b32_e32 v13, s12
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v13
+; GFX908-NEXT: v_mov_b32_e32 v1, s13
+; GFX908-NEXT: v_mov_b32_e32 v2, s14
+; GFX908-NEXT: v_mov_b32_e32 v13, s15
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v13
+; GFX908-NEXT: v_mov_b32_e32 v1, 2
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a4
+; GFX908-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48
+; GFX908-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32
+; GFX908-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_f32_16x16x2bf16:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX90A-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%a = bitcast i32 1 to <2 x i16>
@@ -83,18 +324,53 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_4x4x2bf16:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN: s_load_dwordx4
-; GFX908-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX90A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GCN: v_mfma_f32_4x4x2bf16 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX908-COUNT-4: v_accvgpr_read_b32
-; GFX908: global_store_dwordx4
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX90A: global_store_dwordx4 v{{[0-9]+}}, [[RES]],
define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_f32_4x4x2bf16:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v0, 1
+; GFX908-NEXT: v_mov_b32_e32 v1, 2
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v5, s0
+; GFX908-NEXT: v_mov_b32_e32 v2, s1
+; GFX908-NEXT: v_mov_b32_e32 v3, s2
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v5
+; GFX908-NEXT: v_mov_b32_e32 v5, s3
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX908-NEXT: s_nop 3
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_f32_4x4x2bf16:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1
+; GFX90A-NEXT: v_mov_b32_e32 v2, 2
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_4x4x2bf16 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX90A-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%a = bitcast i32 1 to <2 x i16>
@@ -104,18 +380,110 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN-DAG: s_load_dwordx16
-; GFX908-DAG-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX90A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GCN: v_mfma_f32_32x32x4bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX908-COUNT-16: v_accvgpr_read_b32
-; GFX908: global_store_dwordx4
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX90A-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}],
define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_f32_32x32x4bf16:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v0, 1
+; GFX908-NEXT: v_mov_b32_e32 v16, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v17, s0
+; GFX908-NEXT: v_mov_b32_e32 v1, s1
+; GFX908-NEXT: v_mov_b32_e32 v2, s2
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v17
+; GFX908-NEXT: v_mov_b32_e32 v17, s3
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v17
+; GFX908-NEXT: v_mov_b32_e32 v1, s4
+; GFX908-NEXT: v_mov_b32_e32 v2, s5
+; GFX908-NEXT: v_mov_b32_e32 v17, s6
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v17
+; GFX908-NEXT: v_mov_b32_e32 v1, s7
+; GFX908-NEXT: v_mov_b32_e32 v2, s8
+; GFX908-NEXT: v_mov_b32_e32 v17, s9
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v17
+; GFX908-NEXT: v_mov_b32_e32 v1, s10
+; GFX908-NEXT: v_mov_b32_e32 v2, s11
+; GFX908-NEXT: v_mov_b32_e32 v17, s12
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v17
+; GFX908-NEXT: v_mov_b32_e32 v1, s13
+; GFX908-NEXT: v_mov_b32_e32 v2, s14
+; GFX908-NEXT: v_mov_b32_e32 v17, s15
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v17
+; GFX908-NEXT: v_mov_b32_e32 v1, 2
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a0
+; GFX908-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48
+; GFX908-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32
+; GFX908-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16
+; GFX908-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_f32_32x32x4bf16:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX90A-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%a = bitcast i32 1 to <2 x i16>
@@ -125,18 +493,55 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_16x16x8bf16:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN: s_load_dwordx4
-; GFX908-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX90A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GCN: v_mfma_f32_16x16x8bf16 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX908-COUNT-4: v_accvgpr_read_b32
-; GFX908: global_store_dwordx4
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX90A: global_store_dwordx4 v{{[0-9]+}}, [[RES]],
define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_f32_16x16x8bf16:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v0, 1
+; GFX908-NEXT: v_mov_b32_e32 v1, 2
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v5, s0
+; GFX908-NEXT: v_mov_b32_e32 v2, s1
+; GFX908-NEXT: v_mov_b32_e32 v3, s2
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v5
+; GFX908-NEXT: v_mov_b32_e32 v5, s3
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_f32_16x16x8bf16:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1
+; GFX90A-NEXT: v_mov_b32_e32 v2, 2
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_16x16x8bf16 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX90A-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%a = bitcast i32 1 to <2 x i16>
@@ -147,3 +552,5 @@ bb:
}
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index 4c26961..ff77d5cc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -1,5 +1,8 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefixes=VGPR,GFX90A-VGPR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefixes=VGPR,GFX942-VGPR %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
@@ -10,17 +13,238 @@ declare <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double, double, <4 x doubl
declare double @llvm.amdgcn.mfma.f64.4x4x4f64(double, double, double, i32, i32, i32)
declare i32 @llvm.amdgcn.workitem.id.x()
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16_1k:
-; GCN-DAG: s_load_dwordx16
-; GCN-DAG: s_load_dwordx16
-; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX90A: v_mfma_f32_32x32x4bf16_1k a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f32_32x32x4_2b_bf16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #0 {
+; GFX90A-LABEL: test_mfma_f32_32x32x4bf16_1k:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_32x32x4bf16_1k a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: global_store_dwordx4 v1, a[24:27], s[34:35] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v1, a[28:31], s[34:35] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v1, a[16:19], s[34:35] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v1, a[20:23], s[34:35] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v1, a[8:11], s[34:35] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v1, a[12:15], s[34:35] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[34:35]
+; GFX90A-NEXT: global_store_dwordx4 v1, a[4:7], s[34:35] offset:16
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f32_32x32x4bf16_1k:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 1
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX942-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_accvgpr_write_b32 a0, s16
+; GFX942-NEXT: v_accvgpr_write_b32 a1, s17
+; GFX942-NEXT: v_accvgpr_write_b32 a2, s18
+; GFX942-NEXT: v_accvgpr_write_b32 a3, s19
+; GFX942-NEXT: v_accvgpr_write_b32 a4, s20
+; GFX942-NEXT: v_accvgpr_write_b32 a5, s21
+; GFX942-NEXT: v_accvgpr_write_b32 a6, s22
+; GFX942-NEXT: v_accvgpr_write_b32 a7, s23
+; GFX942-NEXT: v_accvgpr_write_b32 a8, s24
+; GFX942-NEXT: v_accvgpr_write_b32 a9, s25
+; GFX942-NEXT: v_accvgpr_write_b32 a10, s26
+; GFX942-NEXT: v_accvgpr_write_b32 a11, s27
+; GFX942-NEXT: v_accvgpr_write_b32 a12, s28
+; GFX942-NEXT: v_accvgpr_write_b32 a13, s29
+; GFX942-NEXT: v_accvgpr_write_b32 a14, s30
+; GFX942-NEXT: v_accvgpr_write_b32 a15, s31
+; GFX942-NEXT: v_accvgpr_write_b32 a16, s0
+; GFX942-NEXT: v_accvgpr_write_b32 a17, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a18, s2
+; GFX942-NEXT: v_accvgpr_write_b32 a19, s3
+; GFX942-NEXT: v_accvgpr_write_b32 a20, s4
+; GFX942-NEXT: v_accvgpr_write_b32 a21, s5
+; GFX942-NEXT: v_accvgpr_write_b32 a22, s6
+; GFX942-NEXT: v_accvgpr_write_b32 a23, s7
+; GFX942-NEXT: v_accvgpr_write_b32 a24, s8
+; GFX942-NEXT: v_accvgpr_write_b32 a25, s9
+; GFX942-NEXT: v_accvgpr_write_b32 a26, s10
+; GFX942-NEXT: v_accvgpr_write_b32 a27, s11
+; GFX942-NEXT: v_accvgpr_write_b32 a28, s12
+; GFX942-NEXT: v_accvgpr_write_b32 a29, s13
+; GFX942-NEXT: v_accvgpr_write_b32 a30, s14
+; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f32_32x32x4_2b_bf16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 2
+; GFX942-NEXT: global_store_dwordx4 v1, a[24:27], s[34:35] offset:96
+; GFX942-NEXT: global_store_dwordx4 v1, a[28:31], s[34:35] offset:112
+; GFX942-NEXT: global_store_dwordx4 v1, a[16:19], s[34:35] offset:64
+; GFX942-NEXT: global_store_dwordx4 v1, a[20:23], s[34:35] offset:80
+; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[34:35] offset:32
+; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[34:35] offset:48
+; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[34:35]
+; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[34:35] offset:16
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_32x32x4bf16_1k:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v33, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v34, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v35, v33
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v32, 2
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX90A-VGPR-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, s16
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, s17
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, s18
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, s19
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, s20
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, s21
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, s22
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, s23
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, s24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v9, s25
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s26
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s27
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v12, s28
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v13, s29
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v14, s30
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v15, s31
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, s0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, s1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v18, s2
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v19, s3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v20, s4
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v21, s5
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v22, s6
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v23, s7
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v28, s12
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v29, s13
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v30, s14
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v31, s15
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_32x32x4bf16_1k v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 2
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[24:27], s[34:35] offset:96
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[28:31], s[34:35] offset:112
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[16:19], s[34:35] offset:64
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[20:23], s[34:35] offset:80
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[8:11], s[34:35] offset:32
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[12:15], s[34:35] offset:48
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[0:3], s[34:35]
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[4:7], s[34:35] offset:16
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x4bf16_1k:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v35, v33
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 2
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, s16
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, s17
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, s18
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, s19
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s20
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s21
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s22
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s23
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s25
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s26
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s27
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s28
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s29
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, s30
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, s31
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, s4
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, s5
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, s6
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, s7
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, s12
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, s13
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, s14
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[24:27], s[34:35] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[28:31], s[34:35] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[16:19], s[34:35] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[20:23], s[34:35] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[8:11], s[34:35] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[12:15], s[34:35] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[0:3], s[34:35]
+; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[4:7], s[34:35] offset:16
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
@@ -30,16 +254,134 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_16x16x4bf16_1k:
-; GCN-DAG: s_load_dwordx16
-; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GCN-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX90A: v_mfma_f32_16x16x4bf16_1k a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f32_16x16x4_4b_bf16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #0 {
+; GFX90A-LABEL: test_mfma_f32_16x16x4bf16_1k:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_16x16x4bf16_1k a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[16:17]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f32_16x16x4bf16_1k:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 1
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f32_16x16x4_4b_bf16 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 2
+; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
+; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
+; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
+; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[16:17]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x4bf16_1k:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v18, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v19, v17
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, 2
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_16x16x4bf16_1k v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 2
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x4bf16_1k:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, v17
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 2
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_4b_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
@@ -49,16 +391,82 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_4x4x4bf16_1k:
-; GCN-DAG: s_load_dwordx4
-; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX90A: v_mfma_f32_4x4x4bf16_1k [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f32_4x4x4_16b_bf16 [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, [[RES]],
define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 {
+; GFX90A-LABEL: test_mfma_f32_4x4x4bf16_1k:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_4x4x4bf16_1k a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f32_4x4x4bf16_1k:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 1
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f32_4x4x4_16b_bf16 a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: s_nop 4
+; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 2
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_4x4x4bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 4
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v5
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 2
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 4
+; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
@@ -68,16 +476,136 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x8bf16_1k:
-; GCN-DAG: s_load_dwordx16
-; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GCN-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX90A: v_mfma_f32_32x32x8bf16_1k a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f32_32x32x8_bf16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #0 {
+; GFX90A-LABEL: test_mfma_f32_32x32x8bf16_1k:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_32x32x8bf16_1k a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[16:17]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f32_32x32x8bf16_1k:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 1
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f32_32x32x8_bf16 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 2
+; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
+; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
+; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
+; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[16:17]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_32x32x8bf16_1k:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v18, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v19, v17
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, 2
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_32x32x8bf16_1k v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 2
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8bf16_1k:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, v17
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 2
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
@@ -87,16 +615,84 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_16x16x16bf16_1k:
-; GCN-DAG: s_load_dwordx4
-; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX90A: v_mfma_f32_16x16x16bf16_1k [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f32_16x16x16_bf16 [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9+]}}], v[[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, [[RES]],
define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) #0 {
+; GFX90A-LABEL: test_mfma_f32_16x16x16bf16_1k:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_16x16x16bf16_1k a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f32_16x16x16bf16_1k:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 1
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f32_16x16x16_bf16 a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: s_nop 6
+; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 2
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_16x16x16bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 2
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v5
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 2
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 6
+; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%a = bitcast i64 1 to <4 x i16>
@@ -106,13 +702,70 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f64_4x4x4f64:
-; GFX90A: v_mfma_f64_4x4x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 0{{$}}
-; GFX90A: v_mfma_f64_4x4x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f64_4x4x4_4b_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 0{{$}}
-; GFX942: v_mfma_f64_4x4x4_4b_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
-; GCN: global_store_dwordx2
define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double %a, double %b) #0 {
+; GFX90A-LABEL: test_mfma_f64_4x4x4f64:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], 0
+; GFX90A-NEXT: s_nop 3
+; GFX90A-NEXT: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: global_store_dwordx2 v0, a[0:1], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f64_4x4x4f64:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], 0
+; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], a[0:1] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: global_store_dwordx2 v0, a[0:1], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_4x4x4f64:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[4:5], v[0:1], v[2:3], 0
+; GFX90A-VGPR-NEXT: s_nop 3
+; GFX90A-VGPR-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_4x4x4f64:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[4:5], v[0:1], v[2:3], 0
+; GFX942-VGPR-NEXT: s_nop 3
+; GFX942-VGPR-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0)
%mai.2 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %mai.1, i32 1, i32 2, i32 3)
@@ -120,13 +773,110 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64:
-; GCN: s_load_dwordx8
-; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 neg:[1,1,0]
-; GCN: global_store_dwordx4
-; GCN: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, double %a, double %b) #0 {
+; GFX90A-LABEL: test_mfma_f64_16x16x4f64:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, s10
+; GFX90A-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; GFX90A-NEXT: v_mov_b32_e32 v3, s11
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f64_16x16x4f64:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s10
+; GFX90A-VGPR-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s11
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s10
+; GFX942-VGPR-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s11
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x double>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %in.1, i32 1, i32 2, i32 3)
@@ -134,14 +884,78 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_0:
-; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 0{{$}}
-; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 0{{$}}
-; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
-; GCN: global_store_dwordx4
-; GCN: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1) %arg, double %a, double %b) #0 {
+; GFX90A-LABEL: test_mfma_f64_16x16x4f64_splat_imm_0:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_0:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 0
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_0:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 0
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_0:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 0
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> zeroinitializer, i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
@@ -149,14 +963,78 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_int_neg1:
-; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], -1{{$}}
-; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], -1{{$}}
-; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
-; GCN: global_store_dwordx4
-; GCN: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrspace(1) %arg, double %a, double %b) #0 {
+; GFX90A-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_neg1:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_neg1:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], -1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_neg1:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], -1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_neg1:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], -1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 -1 to double)), i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
@@ -164,14 +1042,78 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_1:
-; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 1.0{{$}}
-; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 1.0{{$}}
-; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
-; GCN: global_store_dwordx4
-; GCN: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1) %arg, double %a, double %b) #0 {
+; GFX90A-LABEL: test_mfma_f64_16x16x4f64_splat_imm_1:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 1.0
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_1:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 1.0
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_1:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 1.0
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_1:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 1.0
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double 1.0), i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
@@ -179,14 +1121,78 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_neg1:
-; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], -1.0{{$}}
-; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], -1.0{{$}}
-; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
-; GCN: global_store_dwordx4
-; GCN: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace(1) %arg, double %a, double %b) #0 {
+; GFX90A-LABEL: test_mfma_f64_16x16x4f64_splat_imm_neg1:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -1.0
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_neg1:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], -1.0
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_neg1:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], -1.0
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_neg1:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], -1.0
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double -1.0), i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
@@ -194,14 +1200,78 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_int_64:
-; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 64{{$}}
-; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 64{{$}}
-; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
-; GCN: global_store_dwordx4
-; GCN: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspace(1) %arg, double %a, double %b) #0 {
+; GFX90A-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 64
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 64
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 64
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 64
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 64 to double)), i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
@@ -209,23 +1279,116 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits:
-; GCN: v_accvgpr_write_b32 a[[A_LOW_BITS_0:[0-9]+]], 0{{$}}
-; GCN: v_accvgpr_write_b32 a[[A_HIGH_BITS_0:[0-9]+]], 64
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_HIGH_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_HIGH_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a[[LAST_CONST_REG:[0-9]+]], a[[A_HIGH_BITS_0]]
-
-; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
-; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
-; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
-; GCN: global_store_dwordx4
-; GCN: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits(ptr addrspace(1) %arg, double %a, double %b) #0 {
+; GFX90A-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, 64
+; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a1
+; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a1
+; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a1
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, 64
+; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1
+; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a5, a1
+; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a7, a1
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 64
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 64
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 274877906944 to double)), i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
@@ -233,23 +1396,110 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low:
-; GCN: v_accvgpr_write_b32 a[[A_LOW_BITS_0:[0-9]+]], 64{{$}}
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a[[LAST_CONST_REG:[0-9]+]], a[[A_LOW_BITS_0]]
-
-; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
-; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
-; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
-; GCN: global_store_dwordx4
-; GCN: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low(ptr addrspace(1) %arg, double %a, double %b) #0 {
+; GFX90A-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, 64
+; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT: v_accvgpr_write_b32 a0, 64
+; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 64
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7]
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 64
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7]
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 274877907008 to double)), i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
@@ -257,23 +1507,110 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_low:
-; GCN: v_accvgpr_write_b32 a[[A_LOW_BITS_0:[0-9]+]], 1.0
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, a[[A_LOW_BITS_0]]
-; GCN: v_accvgpr_mov_b32 a[[LAST_CONST_REG:[0-9]+]], a[[A_LOW_BITS_0]]
-
-; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
-; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
-; GFX942: v_mfma_f64_16x16x4_f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a{{\[}}[[A_LOW_BITS_0]]:[[LAST_CONST_REG]]{{\]$}}
-; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 neg:[1,1,0]
-; GCN: global_store_dwordx4
-; GCN: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_low(ptr addrspace(1) %arg, double %a, double %b) #0 {
+; GFX90A-LABEL: test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_low:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_low:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT: v_accvgpr_write_b32 a0, 1.0
+; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_low:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7]
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 0
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_low:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7]
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (<2 x float> splat (float 1.0) to double)), i32 0, i32 0, i32 0)
%mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
@@ -281,26 +1618,236 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_imm:
-; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}}
-; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}}
-; GCN: global_store_dwordx4
-; GCN: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, double %a, double %b) #0 {
+; GFX90A-LABEL: test_mfma_f64_16x16x4f64_imm:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x3ff00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-NEXT: v_mov_b32_e32 v3, s3
+; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7]
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f64_16x16x4f64_imm:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x3ff00000
+; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, s2
+; GFX942-NEXT: v_mov_b32_e32 v3, s3
+; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_imm:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v12, s2
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v13, s3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_imm:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, 0x3ff00000
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
store <4 x double> %mai.1, ptr addrspace(1) %arg
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_lit:
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x405ec000
-; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}}
-; GFX942: v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}}
-; GCN: global_store_dwordx4
-; GCN: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %arg, double %a, double %b) #0 {
+; GFX90A-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x405ec000
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, s2
+; GFX90A-NEXT: v_mov_b32_e32 v3, s3
+; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a1
+; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a1
+; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a1
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7]
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x405ec000
+; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, s2
+; GFX942-NEXT: v_mov_b32_e32 v3, s3
+; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1
+; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a5, a1
+; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a7, a1
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, 0x405ec000
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v12, s2
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v13, s3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 7
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0x405ec000
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0)
store <4 x double> %mai.1, ptr addrspace(1) %arg
@@ -308,3 +1855,6 @@ bb:
}
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
+; VGPR: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
index b792a12..beda16c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
@@ -1,12 +1,9 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,VGPRCD %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -global-isel < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL,VGPRCD %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,AGPRCD %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 -global-isel < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL,AGPRCD %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-VGPRCD,GFX942-SDAG,GFX942-VGPRCD-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-VGPRCD,GFX942-GISEL,GFX942-VGPRCD-GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,VGPRCD %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL,VGPRCD %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -stress-regalloc=10 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,AGPRCD %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -stress-regalloc=10 -global-isel < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL,AGPRCD %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GFX950,GFX950-VGPRCD,GFX950-SDAG,GFX950-VGPRCD-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GFX950,GFX950-VGPRCD,GFX950-GISEL,GFX950-VGPRCD-GISEL %s
declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64, i64, <4 x i32>, i32, i32, i32)
declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64, i64, <16 x i32>, i32, i32, i32)
@@ -33,17 +30,132 @@ declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32>, <4 x i3
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
-; GCN-LABEL: {{^}}test_mfma_i32_16x16x32i8:
-; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
-; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX942: v_mfma_i32_16x16x32_i8 a[{{[0-9]+:[0-9]+}}], v[[[TWO]]:[[ONE]]], v[[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_i32_16x16x32_i8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_i32_16x16x32i8:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 5
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_mfma_i32_16x16x32i8:
+; GFX950-GISEL: ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: s_nop 6
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 4294967298, i64 12884901892, <4 x i32> %in.1, i32 1, i32 2, i32 3)
@@ -51,17 +163,154 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_i32_32x32x16i8:
-; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
-; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
-; GCN-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX942: v_mfma_i32_32x32x16_i8 a[{{[0-9]+:[0-9]+}}], v[[[TWO]]:[[ONE]]], v[[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_i32_32x32x16_i8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 {
+; GFX942-SDAG-LABEL: test_mfma_i32_32x32x16i8:
+; GFX942-SDAG: ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: s_nop 7
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_i32_32x32x16i8:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 7
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: test_mfma_i32_32x32x16i8:
+; GFX950-SDAG: ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT: s_nop 7
+; GFX950-SDAG-NEXT: s_nop 2
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_mfma_i32_32x32x16i8:
+; GFX950-GISEL: ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: s_nop 7
+; GFX950-GISEL-NEXT: s_nop 2
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 4294967298, i64 12884901892, <16 x i32> %in.1, i32 1, i32 2, i32 3)
@@ -69,17 +318,132 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_bf8_bf8:
-; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
-; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX942: v_mfma_f32_16x16x32_bf8_bf8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_f32_16x16x32_bf8_bf8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 5
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX950-GISEL: ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: s_nop 6
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -87,17 +451,132 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_bf8_fp8:
-; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
-; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX942: v_mfma_f32_16x16x32_bf8_fp8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_f32_16x16x32_bf8_fp8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 5
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX950-GISEL: ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: s_nop 6
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -105,17 +584,132 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_fp8_bf8:
-; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
-; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX942: v_mfma_f32_16x16x32_fp8_bf8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_f32_16x16x32_fp8_bf8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 5
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX950-GISEL: ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: s_nop 6
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -123,17 +717,132 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_16x16x32_fp8_fp8:
-; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
-; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX942: v_mfma_f32_16x16x32_fp8_fp8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_f32_16x16x32_fp8_fp8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 5
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX950-GISEL: ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: s_nop 6
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -141,17 +850,154 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_bf8_bf8:
-; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
-; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX942: v_mfma_f32_32x32x16_bf8_bf8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_f32_32x32x16_bf8_bf8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg) #0 {
+; GFX942-SDAG-LABEL: test_mfma_f32_32x32x16_bf8_bf8:
+; GFX942-SDAG: ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: s_nop 7
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_f32_32x32x16_bf8_bf8:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 7
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: test_mfma_f32_32x32x16_bf8_bf8:
+; GFX950-SDAG: ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT: s_nop 7
+; GFX950-SDAG-NEXT: s_nop 2
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_mfma_f32_32x32x16_bf8_bf8:
+; GFX950-GISEL: ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: s_nop 7
+; GFX950-GISEL-NEXT: s_nop 2
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 1, i32 2, i32 3)
@@ -159,17 +1005,154 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_bf8_fp8:
-; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
-; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX942: v_mfma_f32_32x32x16_bf8_fp8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_f32_32x32x16_bf8_fp8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg) #0 {
+; GFX942-SDAG-LABEL: test_mfma_f32_32x32x16_bf8_fp8:
+; GFX942-SDAG: ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: s_nop 7
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_f32_32x32x16_bf8_fp8:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 7
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: test_mfma_f32_32x32x16_bf8_fp8:
+; GFX950-SDAG: ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT: s_nop 7
+; GFX950-SDAG-NEXT: s_nop 2
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_mfma_f32_32x32x16_bf8_fp8:
+; GFX950-GISEL: ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: s_nop 7
+; GFX950-GISEL-NEXT: s_nop 2
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 1, i32 2, i32 3)
@@ -177,17 +1160,154 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_fp8_bf8:
-; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
-; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX942: v_mfma_f32_32x32x16_fp8_bf8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_f32_32x32x16_fp8_bf8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg) #0 {
+; GFX942-SDAG-LABEL: test_mfma_f32_32x32x16_fp8_bf8:
+; GFX942-SDAG: ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: s_nop 7
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_f32_32x32x16_fp8_bf8:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 7
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: test_mfma_f32_32x32x16_fp8_bf8:
+; GFX950-SDAG: ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT: s_nop 7
+; GFX950-SDAG-NEXT: s_nop 2
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_mfma_f32_32x32x16_fp8_bf8:
+; GFX950-GISEL: ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: s_nop 7
+; GFX950-GISEL-NEXT: s_nop 2
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 1, i32 2, i32 3)
@@ -195,17 +1315,154 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x16_fp8_fp8:
-; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
-; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
-; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3
-; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX942: v_mfma_f32_32x32x16_fp8_fp8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_f32_32x32x16_fp8_fp8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg) #0 {
+; GFX942-SDAG-LABEL: test_mfma_f32_32x32x16_fp8_fp8:
+; GFX942-SDAG: ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: s_nop 7
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_f32_32x32x16_fp8_fp8:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 7
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: test_mfma_f32_32x32x16_fp8_fp8:
+; GFX950-SDAG: ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT: s_nop 7
+; GFX950-SDAG-NEXT: s_nop 2
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_mfma_f32_32x32x16_fp8_fp8:
+; GFX950-GISEL: ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: s_nop 7
+; GFX950-GISEL-NEXT: s_nop 2
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 4294967298, i64 12884901892, <16 x float> %in.1, i32 1, i32 2, i32 3)
@@ -213,15 +1470,132 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_f32_16x16x32_f16:
-; GCN: s_load_dwordx4 s[[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]][[[RLO:[0-9]+]]:{{[0-9]+}}], s[[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_f32_16x16x32_f16 [[CD]][[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]][[[RLO]]:[[RHI]]]
define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_f32_16x16x32_f16:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16:
+; GFX942-AGPRCD: ; %bb.0: ; %bb
+; GFX942-AGPRCD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-AGPRCD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-AGPRCD-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX942-AGPRCD-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-AGPRCD-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-AGPRCD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX942-AGPRCD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-NEXT: v_mov_b32_e32 v6, s0
+; GFX942-AGPRCD-NEXT: s_nop 1
+; GFX942-AGPRCD-NEXT: v_smfmac_f32_16x16x32_f16 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-NEXT: s_nop 5
+; GFX942-AGPRCD-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
+; GFX942-AGPRCD-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_f32_16x16x32_f16:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16:
+; GFX950-AGPRCD: ; %bb.0: ; %bb
+; GFX950-AGPRCD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-AGPRCD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX950-AGPRCD-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX950-AGPRCD-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-AGPRCD-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-AGPRCD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX950-AGPRCD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-NEXT: v_mov_b32_e32 v6, s0
+; GFX950-AGPRCD-NEXT: s_nop 1
+; GFX950-AGPRCD-NEXT: v_smfmac_f32_16x16x32_f16 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-NEXT: s_nop 6
+; GFX950-AGPRCD-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
+; GFX950-AGPRCD-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %a, <8 x half> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
@@ -229,18 +1603,278 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_f32_32x32x16_f16:
-; GCN: s_load_dwordx16 s[[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]][[[RLO:[0-9]+]]:{{[0-9]+}}], s[[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_f32_32x32x16_f16 [[CD]][[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s24
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_f16:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s24
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[26:27]
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[24:25], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[28:29]
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[30:31]
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[24:25] offset:48
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[24:25] offset:32
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[24:25] offset:16
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[24:25]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_f16:
+; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[26:27]
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[24:25], 0x0
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[28:29]
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[30:31]
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[24:25]
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[24:25] offset:16
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[24:25] offset:32
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[24:25] offset:48
+; GFX942-AGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s24
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_f16:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s24
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[26:27]
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[24:25], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[28:29]
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[30:31]
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[24:25] offset:48
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[24:25] offset:32
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[24:25] offset:16
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[24:25]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_f16:
+; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[26:27]
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[24:25], 0x0
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[28:29]
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[30:31]
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[24:25]
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[24:25] offset:16
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[24:25] offset:32
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[24:25] offset:48
+; GFX950-AGPRCD-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %a, <8 x half> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
@@ -248,15 +1882,132 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_f32_16x16x32_bf16:
-; GCN: s_load_dwordx4 s[[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]][[[RLO:[0-9]+]]:{{[0-9]+}}], s[[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_f32_16x16x32_bf16 [[CD]][[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]][[[RLO]]:[[RHI]]]
define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16:
+; GFX942-AGPRCD: ; %bb.0: ; %bb
+; GFX942-AGPRCD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-AGPRCD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-AGPRCD-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX942-AGPRCD-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-AGPRCD-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-AGPRCD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX942-AGPRCD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-NEXT: v_mov_b32_e32 v6, s0
+; GFX942-AGPRCD-NEXT: s_nop 1
+; GFX942-AGPRCD-NEXT: v_smfmac_f32_16x16x32_bf16 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-NEXT: s_nop 5
+; GFX942-AGPRCD-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
+; GFX942-AGPRCD-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16:
+; GFX950-AGPRCD: ; %bb.0: ; %bb
+; GFX950-AGPRCD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-AGPRCD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
+; GFX950-AGPRCD-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX950-AGPRCD-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-AGPRCD-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-AGPRCD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX950-AGPRCD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-NEXT: v_mov_b32_e32 v6, s0
+; GFX950-AGPRCD-NEXT: s_nop 1
+; GFX950-AGPRCD-NEXT: v_smfmac_f32_16x16x32_bf16 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-NEXT: s_nop 6
+; GFX950-AGPRCD-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
+; GFX950-AGPRCD-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %a, <8 x i16> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
@@ -264,18 +2015,278 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_f32_32x32x16_bf16:
-; GCN: s_load_dwordx16 s[[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]][[[RLO:[0-9]+]]:{{[0-9]+}}], s[[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_f32_32x32x16_bf16 [[CD]][[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s24
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s24
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[26:27]
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[24:25], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[28:29]
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[30:31]
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[24:25] offset:48
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[24:25] offset:32
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[24:25] offset:16
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[24:25]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16:
+; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[26:27]
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[24:25], 0x0
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[28:29]
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[30:31]
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[24:25]
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[24:25] offset:16
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[24:25] offset:32
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[24:25] offset:48
+; GFX942-AGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s24
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19]
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s24
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[26:27]
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[24:25], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[28:29]
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[30:31]
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[24:25] offset:48
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[24:25] offset:32
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[24:25] offset:16
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[24:25]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16:
+; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[26:27]
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[24:25], 0x0
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[28:29]
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[30:31]
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[24:25]
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[24:25] offset:16
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[24:25] offset:32
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[24:25] offset:48
+; GFX950-AGPRCD-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %a, <8 x i16> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
@@ -283,15 +2294,214 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_i8:
-; GCN: s_load_dwordx4 s[[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]][[[RLO:[0-9]+]]:{{[0-9]+}}], s[[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_i32_16x16x64_i8 [[CD]][[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]][[[RLO]]:[[RHI]]]
define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 5
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
+; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-GISEL-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s2
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s3
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-AGPRCD-GISEL-NEXT: v_writelane_b32 v7, s0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-GISEL-NEXT: v_readlane_b32 s0, v7, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 5
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-AGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
+; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-GISEL-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s2
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s3
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX950-AGPRCD-GISEL-NEXT: v_writelane_b32 v7, s0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-GISEL-NEXT: v_readlane_b32 s0, v7, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 6
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-AGPRCD-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <4 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %a, <4 x i32> %b, <4 x i32> %in.1, i32 %idx, i32 1, i32 2)
@@ -299,18 +2509,310 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_i8:
-; GCN: s_load_dwordx16 s[[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]][[[RLO:[0-9]+]]:{{[0-9]+}}], s[[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_i32_32x32x32_i8 [[CD]][[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
+; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[24:27], s[4:5], 0x2c
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[24:25]
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dword s2, s[4:5], 0x44
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s26
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s27
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-AGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
+; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[24:27], s[4:5], 0x2c
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[24:25]
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dword s2, s[4:5], 0x44
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s26
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s27
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX950-AGPRCD-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %a, <4 x i32> %b, <16 x i32> %in.1, i32 %idx, i32 1, i32 2)
@@ -318,15 +2820,214 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_bf8_bf8:
-; GCN: s_load_dwordx4 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_f32_16x16x64_bf8_bf8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]]
define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 5
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
+; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-GISEL-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s2
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s3
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-AGPRCD-GISEL-NEXT: v_writelane_b32 v7, s0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-GISEL-NEXT: v_readlane_b32 s0, v7, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 5
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-AGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
+; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-GISEL-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s2
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s3
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX950-AGPRCD-GISEL-NEXT: v_writelane_b32 v7, s0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-GISEL-NEXT: v_readlane_b32 s0, v7, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 6
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-AGPRCD-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
@@ -334,15 +3035,214 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_bf8_fp8:
-; GCN: s_load_dwordx4 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_f32_16x16x64_bf8_fp8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]]
define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 5
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
+; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-GISEL-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s2
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s3
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-AGPRCD-GISEL-NEXT: v_writelane_b32 v7, s0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-GISEL-NEXT: v_readlane_b32 s0, v7, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 5
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-AGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
+; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-GISEL-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s2
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s3
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX950-AGPRCD-GISEL-NEXT: v_writelane_b32 v7, s0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-GISEL-NEXT: v_readlane_b32 s0, v7, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 6
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-AGPRCD-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
@@ -350,15 +3250,214 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_fp8_bf8:
-; GCN: s_load_dwordx4 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_f32_16x16x64_fp8_bf8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]]
define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 5
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
+; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-GISEL-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s2
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s3
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-AGPRCD-GISEL-NEXT: v_writelane_b32 v7, s0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-GISEL-NEXT: v_readlane_b32 s0, v7, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 5
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-AGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
+; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-GISEL-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s2
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s3
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX950-AGPRCD-GISEL-NEXT: v_writelane_b32 v7, s0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-GISEL-NEXT: v_readlane_b32 s0, v7, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 6
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-AGPRCD-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
@@ -366,15 +3465,214 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_i32_16x16x64_fp8_fp8:
-; GCN: s_load_dwordx4 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_f32_16x16x64_fp8_fp8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:[[RHI]]]
define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 5
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
+; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-GISEL-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s2
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s3
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-AGPRCD-GISEL-NEXT: v_writelane_b32 v7, s0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-GISEL-NEXT: v_readlane_b32 s0, v7, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 5
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-AGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
+; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-GISEL-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s2
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s3
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX950-AGPRCD-GISEL-NEXT: v_writelane_b32 v7, s0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-GISEL-NEXT: v_readlane_b32 s0, v7, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 a[0:3], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 6
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-AGPRCD-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
@@ -382,18 +3680,310 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_bf8_bf8:
-; GCN: s_load_dwordx16 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_f32_32x32x32_bf8_bf8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
+; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[24:27], s[4:5], 0x2c
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[24:25]
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dword s2, s[4:5], 0x44
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s26
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s27
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-AGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
+; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[24:27], s[4:5], 0x2c
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[24:25]
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dword s2, s[4:5], 0x44
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s26
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s27
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX950-AGPRCD-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
@@ -401,18 +3991,310 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_bf8_fp8:
-; GCN: s_load_dwordx16 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_f32_32x32x32_bf8_fp8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
+; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[24:27], s[4:5], 0x2c
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[24:25]
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dword s2, s[4:5], 0x44
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s26
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s27
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-AGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
+; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[24:27], s[4:5], 0x2c
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[24:25]
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dword s2, s[4:5], 0x44
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s26
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s27
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX950-AGPRCD-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
@@ -420,18 +4302,310 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_fp8_bf8:
-; GCN: s_load_dwordx16 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_f32_32x32x32_fp8_bf8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
+; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[24:27], s[4:5], 0x2c
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[24:25]
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dword s2, s[4:5], 0x44
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s26
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s27
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-AGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
+; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[24:27], s[4:5], 0x2c
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[24:25]
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dword s2, s[4:5], 0x44
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s26
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s27
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX950-AGPRCD-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
@@ -439,18 +4613,310 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_smfmac_i32_32x32x32_fp8_fp8:
-; GCN: s_load_dwordx16 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 [[CD:v]]{{\[}}[[RLO:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SLO]]:{{[0-9]+}}]{{$}}
-; VGPRCD-DAG: v_mov_b64_e32 v[{{[0-9]+}}:[[RHI:[0-9]+]]], s[{{[0-9]+}}:[[SHI]]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 [[CD:a]][[RLO:[0-9]+]], s[[SLO]]{{$}}
-; AGPRCD-DAG: v_accvgpr_write_b32 a[[RHI:[0-9]+]], s[[SHI]]{{$}}
-; GCN: v_smfmac_f32_32x32x32_fp8_fp8 [[CD]]{{\[}}[[RLO]]:[[RHI]]], {{[av]}}[{{[0-9:]+}}], {{[av]}}[{{[0-9:]+}}], v{{[0-9]+}} cbsz:1 abid:2
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]]{{\[}}[[RLO]]:{{[0-9]+}}], s[{{[0-9:]+}}]{{$}}
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:16
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9:]+}}], s[{{[0-9:]+}}] offset:32
-; GCN-DAG: global_store_dwordx4 v{{[0-9]+}}, [[CD]][{{[0-9]+}}:[[RHI]]], s[{{[0-9:]+}}] offset:48
define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
+; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
+; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18
+; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
+; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[24:27], s[4:5], 0x2c
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[24:25]
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX942-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX942-AGPRCD-GISEL-NEXT: s_load_dword s2, s[4:5], 0x44
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s26
+; GFX942-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s27
+; GFX942-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX942-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-AGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
+; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18
+; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-VGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s8
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s9
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, s14
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 2
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
+; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[24:27], s[4:5], 0x2c
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[24:25]
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a4, s12
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a5, s13
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a6, s14
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a7, s15
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a8, s16
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a9, s17
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a10, s18
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a11, s19
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a12, s20
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a13, s21
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a14, s22
+; GFX950-AGPRCD-GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x3c
+; GFX950-AGPRCD-GISEL-NEXT: s_load_dword s2, s[4:5], 0x44
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s8, s26
+; GFX950-AGPRCD-GISEL-NEXT: s_mov_b32 s9, s27
+; GFX950-AGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 1
+; GFX950-AGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 a[0:15], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-AGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 7
+; GFX950-AGPRCD-GISEL-NEXT: s_nop 2
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX950-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX950-AGPRCD-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
@@ -459,3 +4925,8 @@ bb:
}
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX942: {{.*}}
+; GFX942-VGPRCD: {{.*}}
+; GFX950: {{.*}}
+; GFX950-VGPRCD: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
index 452033f..8081a15 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -15,9 +15,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GCN-NEXT: v_mov_b64_e32 v[12:13], 48
-; GCN-NEXT: v_mov_b64_e32 v[14:15], 32
-; GCN-NEXT: v_mov_b64_e32 v[16:17], 16
+; GCN-NEXT: v_mov_b64_e32 v[8:9], 48
+; GCN-NEXT: v_mov_b64_e32 v[10:11], 32
+; GCN-NEXT: v_mov_b64_e32 v[12:13], 16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -39,42 +39,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
; GCN-NEXT: v_accvgpr_write_b32 a13, s21
; GCN-NEXT: v_accvgpr_write_b32 a14, s22
; GCN-NEXT: v_accvgpr_write_b32 a15, s23
-; GCN-NEXT: v_mov_b64_e32 v[18:19], 0
-; GCN-NEXT: v_mov_b32_e32 v8, s16
+; GCN-NEXT: v_mov_b64_e32 v[14:15], 0
+; GCN-NEXT: v_mov_b32_e32 v16, s16
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15]
; GCN-NEXT: v_mov_b32_e32 v0, s20
; GCN-NEXT: v_mov_b32_e32 v1, s21
; GCN-NEXT: v_mov_b32_e32 v2, s22
; GCN-NEXT: v_mov_b32_e32 v3, s23
-; GCN-NEXT: v_mov_b32_e32 v9, s17
-; GCN-NEXT: v_mov_b32_e32 v10, s18
-; GCN-NEXT: v_mov_b32_e32 v11, s19
+; GCN-NEXT: v_mov_b32_e32 v17, s17
+; GCN-NEXT: v_mov_b32_e32 v18, s18
+; GCN-NEXT: v_mov_b32_e32 v19, s19
; GCN-NEXT: s_nop 4
-; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
@@ -88,9 +88,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GCN-NEXT: v_mov_b64_e32 v[12:13], 48
-; GCN-NEXT: v_mov_b64_e32 v[14:15], 32
-; GCN-NEXT: v_mov_b64_e32 v[16:17], 16
+; GCN-NEXT: v_mov_b64_e32 v[8:9], 48
+; GCN-NEXT: v_mov_b64_e32 v[10:11], 32
+; GCN-NEXT: v_mov_b64_e32 v[12:13], 16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -112,42 +112,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
; GCN-NEXT: v_accvgpr_write_b32 a13, s21
; GCN-NEXT: v_accvgpr_write_b32 a14, s22
; GCN-NEXT: v_accvgpr_write_b32 a15, s23
-; GCN-NEXT: v_mov_b64_e32 v[18:19], 0
-; GCN-NEXT: v_mov_b32_e32 v8, s16
+; GCN-NEXT: v_mov_b64_e32 v[14:15], 0
+; GCN-NEXT: v_mov_b32_e32 v16, s16
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
; GCN-NEXT: v_mov_b32_e32 v0, s20
; GCN-NEXT: v_mov_b32_e32 v1, s21
; GCN-NEXT: v_mov_b32_e32 v2, s22
; GCN-NEXT: v_mov_b32_e32 v3, s23
-; GCN-NEXT: v_mov_b32_e32 v9, s17
-; GCN-NEXT: v_mov_b32_e32 v10, s18
-; GCN-NEXT: v_mov_b32_e32 v11, s19
+; GCN-NEXT: v_mov_b32_e32 v17, s17
+; GCN-NEXT: v_mov_b32_e32 v18, s18
+; GCN-NEXT: v_mov_b32_e32 v19, s19
; GCN-NEXT: s_nop 4
-; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v2, s10
; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 2, i32 3, i32 1)
@@ -252,62 +252,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v44, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GCN-NEXT: v_accvgpr_write_b32 a31, s23
-; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GCN-NEXT: v_accvgpr_write_b32 a30, s22
-; GCN-NEXT: v_accvgpr_write_b32 a29, s21
-; GCN-NEXT: v_accvgpr_write_b32 a28, s20
-; GCN-NEXT: v_accvgpr_write_b32 a27, s19
-; GCN-NEXT: v_accvgpr_write_b32 a26, s18
-; GCN-NEXT: v_accvgpr_write_b32 a25, s17
-; GCN-NEXT: v_accvgpr_write_b32 a24, s16
-; GCN-NEXT: v_accvgpr_write_b32 a23, s15
-; GCN-NEXT: v_accvgpr_write_b32 a22, s14
-; GCN-NEXT: v_accvgpr_write_b32 a21, s13
-; GCN-NEXT: v_accvgpr_write_b32 a20, s12
-; GCN-NEXT: v_accvgpr_write_b32 a19, s11
-; GCN-NEXT: v_accvgpr_write_b32 a18, s10
-; GCN-NEXT: v_accvgpr_write_b32 a17, s9
-; GCN-NEXT: v_accvgpr_write_b32 a16, s8
-; GCN-NEXT: v_mov_b32_e32 v8, s20
-; GCN-NEXT: v_mov_b32_e32 v9, s21
-; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31]
-; GCN-NEXT: v_mov_b32_e32 v10, s22
-; GCN-NEXT: v_mov_b32_e32 v11, s23
-; GCN-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NEXT: v_mov_b32_e32 v2, s18
-; GCN-NEXT: v_mov_b32_e32 v3, s19
-; GCN-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v40, s20
+; GCN-NEXT: v_mov_b32_e32 v41, s21
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31]
+; GCN-NEXT: v_mov_b32_e32 v42, s22
+; GCN-NEXT: v_mov_b32_e32 v43, s23
+; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 2
+; GCN-NEXT: v_mov_b32_e32 v16, s16
+; GCN-NEXT: v_mov_b32_e32 v17, s17
+; GCN-NEXT: v_mov_b32_e32 v18, s18
+; GCN-NEXT: v_mov_b32_e32 v19, s19
+; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v0, s12
-; GCN-NEXT: v_mov_b32_e32 v1, s13
-; GCN-NEXT: v_mov_b32_e32 v2, s14
-; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT: v_mov_b32_e32 v16, s12
+; GCN-NEXT: v_mov_b32_e32 v17, s13
+; GCN-NEXT: v_mov_b32_e32 v18, s14
+; GCN-NEXT: v_mov_b32_e32 v19, s15
+; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NEXT: v_mov_b32_e32 v2, s10
-; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; GCN-NEXT: v_mov_b32_e32 v16, s8
+; GCN-NEXT: v_mov_b32_e32 v17, s9
+; GCN-NEXT: v_mov_b32_e32 v18, s10
+; GCN-NEXT: v_mov_b32_e32 v19, s11
+; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
@@ -322,62 +315,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa
; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mov_b32_e32 v44, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GCN-NEXT: v_accvgpr_write_b32 a31, s23
-; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GCN-NEXT: v_accvgpr_write_b32 a30, s22
-; GCN-NEXT: v_accvgpr_write_b32 a29, s21
-; GCN-NEXT: v_accvgpr_write_b32 a28, s20
-; GCN-NEXT: v_accvgpr_write_b32 a27, s19
-; GCN-NEXT: v_accvgpr_write_b32 a26, s18
-; GCN-NEXT: v_accvgpr_write_b32 a25, s17
-; GCN-NEXT: v_accvgpr_write_b32 a24, s16
-; GCN-NEXT: v_accvgpr_write_b32 a23, s15
-; GCN-NEXT: v_accvgpr_write_b32 a22, s14
-; GCN-NEXT: v_accvgpr_write_b32 a21, s13
-; GCN-NEXT: v_accvgpr_write_b32 a20, s12
-; GCN-NEXT: v_accvgpr_write_b32 a19, s11
-; GCN-NEXT: v_accvgpr_write_b32 a18, s10
-; GCN-NEXT: v_accvgpr_write_b32 a17, s9
-; GCN-NEXT: v_accvgpr_write_b32 a16, s8
-; GCN-NEXT: v_mov_b32_e32 v8, s20
-; GCN-NEXT: v_mov_b32_e32 v9, s21
-; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; GCN-NEXT: v_mov_b32_e32 v10, s22
-; GCN-NEXT: v_mov_b32_e32 v11, s23
-; GCN-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NEXT: v_mov_b32_e32 v2, s18
-; GCN-NEXT: v_mov_b32_e32 v3, s19
-; GCN-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v40, s20
+; GCN-NEXT: v_mov_b32_e32 v41, s21
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; GCN-NEXT: v_mov_b32_e32 v42, s22
+; GCN-NEXT: v_mov_b32_e32 v43, s23
+; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 2
+; GCN-NEXT: v_mov_b32_e32 v16, s16
+; GCN-NEXT: v_mov_b32_e32 v17, s17
+; GCN-NEXT: v_mov_b32_e32 v18, s18
+; GCN-NEXT: v_mov_b32_e32 v19, s19
+; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v0, s12
-; GCN-NEXT: v_mov_b32_e32 v1, s13
-; GCN-NEXT: v_mov_b32_e32 v2, s14
-; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT: v_mov_b32_e32 v16, s12
+; GCN-NEXT: v_mov_b32_e32 v17, s13
+; GCN-NEXT: v_mov_b32_e32 v18, s14
+; GCN-NEXT: v_mov_b32_e32 v19, s15
+; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NEXT: v_mov_b32_e32 v2, s10
-; GCN-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; GCN-NEXT: v_mov_b32_e32 v16, s8
+; GCN-NEXT: v_mov_b32_e32 v17, s9
+; GCN-NEXT: v_mov_b32_e32 v18, s10
+; GCN-NEXT: v_mov_b32_e32 v19, s11
+; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3)
@@ -393,35 +379,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat>
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GCN-NEXT: v_accvgpr_write_b32 a0, s8
-; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GCN-NEXT: v_accvgpr_write_b32 a1, s9
-; GCN-NEXT: v_accvgpr_write_b32 a2, s10
-; GCN-NEXT: v_accvgpr_write_b32 a3, s11
-; GCN-NEXT: v_accvgpr_write_b32 a4, s12
-; GCN-NEXT: v_accvgpr_write_b32 a5, s13
-; GCN-NEXT: v_accvgpr_write_b32 a6, s14
-; GCN-NEXT: v_accvgpr_write_b32 a7, s15
-; GCN-NEXT: v_accvgpr_write_b32 a8, s16
-; GCN-NEXT: v_accvgpr_write_b32 a9, s17
-; GCN-NEXT: v_accvgpr_write_b32 a10, s18
-; GCN-NEXT: v_accvgpr_write_b32 a11, s19
-; GCN-NEXT: v_accvgpr_write_b32 a12, s20
-; GCN-NEXT: v_accvgpr_write_b32 a13, s21
-; GCN-NEXT: v_accvgpr_write_b32 a14, s22
-; GCN-NEXT: v_accvgpr_write_b32 a15, s23
+; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15]
+; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 2
-; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
store <16 x float> %result, ptr addrspace(1) %out
@@ -435,40 +413,32 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf
; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GCN-NEXT: v_accvgpr_write_b32 a0, s8
-; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GCN-NEXT: v_accvgpr_write_b32 a1, s9
-; GCN-NEXT: v_accvgpr_write_b32 a2, s10
-; GCN-NEXT: v_accvgpr_write_b32 a3, s11
-; GCN-NEXT: v_accvgpr_write_b32 a4, s12
-; GCN-NEXT: v_accvgpr_write_b32 a5, s13
-; GCN-NEXT: v_accvgpr_write_b32 a6, s14
-; GCN-NEXT: v_accvgpr_write_b32 a7, s15
-; GCN-NEXT: v_accvgpr_write_b32 a8, s16
-; GCN-NEXT: v_accvgpr_write_b32 a9, s17
-; GCN-NEXT: v_accvgpr_write_b32 a10, s18
-; GCN-NEXT: v_accvgpr_write_b32 a11, s19
-; GCN-NEXT: v_accvgpr_write_b32 a12, s20
-; GCN-NEXT: v_accvgpr_write_b32 a13, s21
-; GCN-NEXT: v_accvgpr_write_b32 a14, s22
-; GCN-NEXT: v_accvgpr_write_b32 a15, s23
+; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 2
-; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GCN-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 3, i32 2, i32 1)
store <16 x float> %result, ptr addrspace(1) %out
ret void
}
-attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
attributes #1 = { "amdgpu-flat-work-group-size"="1,64" }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 9bdae28f..d81ec1c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -141,20 +141,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
@@ -166,16 +164,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
@@ -183,20 +179,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
@@ -266,20 +260,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
@@ -291,16 +283,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
@@ -308,20 +298,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
@@ -394,9 +382,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], 48
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 32
-; SDAG-NEXT: v_mov_b64_e32 v[16:17], 16
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -418,42 +406,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b64_e32 v[18:19], 0
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
; SDAG-NEXT: v_mov_b32_e32 v0, s20
; SDAG-NEXT: v_mov_b32_e32 v1, s21
; SDAG-NEXT: v_mov_b32_e32 v2, s22
; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
; SDAG-NEXT: s_nop 4
-; SDAG-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
; SDAG-NEXT: v_mov_b32_e32 v2, s10
; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -518,9 +506,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 48
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[16:17], 16
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -542,42 +530,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[18:19], 0
-; HEURRC-NEXT: v_mov_b32_e32 v8, s16
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
; HEURRC-NEXT: v_mov_b32_e32 v0, s20
; HEURRC-NEXT: v_mov_b32_e32 v1, s21
; HEURRC-NEXT: v_mov_b32_e32 v2, s22
; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: v_mov_b32_e32 v9, s17
-; HEURRC-NEXT: v_mov_b32_e32 v10, s18
-; HEURRC-NEXT: v_mov_b32_e32 v11, s19
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
; HEURRC-NEXT: s_nop 4
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
; HEURRC-NEXT: v_mov_b32_e32 v2, s10
; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -585,9 +573,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 48
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[48:49], 16
+; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
@@ -601,43 +589,43 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[50:51], 0
-; VGPRRC-NEXT: v_mov_b32_e32 v40, s16
+; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT: v_mov_b32_e32 v48, s16
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15]
-; VGPRRC-NEXT: v_mov_b32_e32 v41, s17
-; VGPRRC-NEXT: v_mov_b32_e32 v42, s18
-; VGPRRC-NEXT: v_mov_b32_e32 v43, s19
+; VGPRRC-NEXT: v_mov_b32_e32 v49, s17
+; VGPRRC-NEXT: v_mov_b32_e32 v50, s18
+; VGPRRC-NEXT: v_mov_b32_e32 v51, s19
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 0
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[40:43], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16:
@@ -776,9 +764,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], 48
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 32
-; SDAG-NEXT: v_mov_b64_e32 v[16:17], 16
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -800,42 +788,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b64_e32 v[18:19], 0
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
; SDAG-NEXT: v_mov_b32_e32 v0, s20
; SDAG-NEXT: v_mov_b32_e32 v1, s21
; SDAG-NEXT: v_mov_b32_e32 v2, s22
; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
; SDAG-NEXT: s_nop 4
-; SDAG-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
; SDAG-NEXT: v_mov_b32_e32 v2, s10
; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -900,9 +888,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 48
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[16:17], 16
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -924,42 +912,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[18:19], 0
-; HEURRC-NEXT: v_mov_b32_e32 v8, s16
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
; HEURRC-NEXT: v_mov_b32_e32 v0, s20
; HEURRC-NEXT: v_mov_b32_e32 v1, s21
; HEURRC-NEXT: v_mov_b32_e32 v2, s22
; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: v_mov_b32_e32 v9, s17
-; HEURRC-NEXT: v_mov_b32_e32 v10, s18
-; HEURRC-NEXT: v_mov_b32_e32 v11, s19
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
; HEURRC-NEXT: s_nop 4
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
; HEURRC-NEXT: v_mov_b32_e32 v2, s10
; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -967,9 +955,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 48
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[48:49], 16
+; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
@@ -983,43 +971,43 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[50:51], 0
-; VGPRRC-NEXT: v_mov_b32_e32 v40, s16
+; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT: v_mov_b32_e32 v48, s16
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1
-; VGPRRC-NEXT: v_mov_b32_e32 v41, s17
-; VGPRRC-NEXT: v_mov_b32_e32 v42, s18
-; VGPRRC-NEXT: v_mov_b32_e32 v43, s19
+; VGPRRC-NEXT: v_mov_b32_e32 v49, s17
+; VGPRRC-NEXT: v_mov_b32_e32 v50, s18
+; VGPRRC-NEXT: v_mov_b32_e32 v51, s19
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 0
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[40:43], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16__flags:
@@ -1505,62 +1493,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: v_mov_b32_e32 v44, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31]
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; SDAG-NEXT: v_mov_b32_e32 v40, s20
+; SDAG-NEXT: v_mov_b32_e32 v41, s21
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
+; SDAG-NEXT: v_mov_b32_e32 v42, s22
+; SDAG-NEXT: v_mov_b32_e32 v43, s23
+; SDAG-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: s_nop 2
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -1569,52 +1550,44 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GISEL-NEXT: v_mov_b32_e32 v24, 0
+; GISEL-NEXT: v_mov_b32_e32 v56, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15]
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -1623,62 +1596,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v12, 0
+; HEURRC-NEXT: v_mov_b32_e32 v44, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
-; HEURRC-NEXT: v_mov_b32_e32 v8, s20
-; HEURRC-NEXT: v_mov_b32_e32 v9, s21
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31]
-; HEURRC-NEXT: v_mov_b32_e32 v10, s22
-; HEURRC-NEXT: v_mov_b32_e32 v11, s23
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
-; HEURRC-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; HEURRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; HEURRC-NEXT: v_mov_b32_e32 v40, s20
+; HEURRC-NEXT: v_mov_b32_e32 v41, s21
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
+; HEURRC-NEXT: v_mov_b32_e32 v42, s22
+; HEURRC-NEXT: v_mov_b32_e32 v43, s23
+; HEURRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: s_nop 2
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s12
-; HEURRC-NEXT: v_mov_b32_e32 v1, s13
-; HEURRC-NEXT: v_mov_b32_e32 v2, s14
-; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s12
+; HEURRC-NEXT: v_mov_b32_e32 v17, s13
+; HEURRC-NEXT: v_mov_b32_e32 v18, s14
+; HEURRC-NEXT: v_mov_b32_e32 v19, s15
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s8
+; HEURRC-NEXT: v_mov_b32_e32 v17, s9
+; HEURRC-NEXT: v_mov_b32_e32 v18, s10
+; HEURRC-NEXT: v_mov_b32_e32 v19, s11
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -1869,62 +1835,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: v_mov_b32_e32 v44, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; SDAG-NEXT: v_mov_b32_e32 v40, s20
+; SDAG-NEXT: v_mov_b32_e32 v41, s21
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; SDAG-NEXT: v_mov_b32_e32 v42, s22
+; SDAG-NEXT: v_mov_b32_e32 v43, s23
+; SDAG-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: s_nop 2
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -1933,52 +1892,44 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GISEL-NEXT: v_mov_b32_e32 v24, 0
+; GISEL-NEXT: v_mov_b32_e32 v56, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -1987,62 +1938,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v12, 0
+; HEURRC-NEXT: v_mov_b32_e32 v44, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
-; HEURRC-NEXT: v_mov_b32_e32 v8, s20
-; HEURRC-NEXT: v_mov_b32_e32 v9, s21
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; HEURRC-NEXT: v_mov_b32_e32 v10, s22
-; HEURRC-NEXT: v_mov_b32_e32 v11, s23
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
-; HEURRC-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; HEURRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; HEURRC-NEXT: v_mov_b32_e32 v40, s20
+; HEURRC-NEXT: v_mov_b32_e32 v41, s21
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; HEURRC-NEXT: v_mov_b32_e32 v42, s22
+; HEURRC-NEXT: v_mov_b32_e32 v43, s23
+; HEURRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: s_nop 2
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s12
-; HEURRC-NEXT: v_mov_b32_e32 v1, s13
-; HEURRC-NEXT: v_mov_b32_e32 v2, s14
-; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s12
+; HEURRC-NEXT: v_mov_b32_e32 v17, s13
+; HEURRC-NEXT: v_mov_b32_e32 v18, s14
+; HEURRC-NEXT: v_mov_b32_e32 v19, s15
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s8
+; HEURRC-NEXT: v_mov_b32_e32 v17, s9
+; HEURRC-NEXT: v_mov_b32_e32 v18, s10
+; HEURRC-NEXT: v_mov_b32_e32 v19, s11
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -2234,35 +2178,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
@@ -2271,35 +2207,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
@@ -2308,35 +2236,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
+; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 2
-; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
@@ -2443,35 +2363,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
@@ -2480,35 +2392,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
@@ -2517,35 +2421,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 2
-; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
@@ -2781,7 +2677,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
@@ -2791,14 +2687,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; SDAG-NEXT: v_mov_b32_e32 v5, s13
; SDAG-NEXT: v_mov_b32_e32 v6, s14
; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b32_e32 v8, s0
+; SDAG-NEXT: v_mov_b32_e32 v9, s1
+; SDAG-NEXT: v_mov_b32_e32 v10, s2
+; SDAG-NEXT: v_mov_b32_e32 v11, s3
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
+; SDAG-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
@@ -2810,16 +2706,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
@@ -2827,7 +2721,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
@@ -2837,14 +2731,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; HEURRC-NEXT: v_mov_b32_e32 v5, s13
; HEURRC-NEXT: v_mov_b32_e32 v6, s14
; HEURRC-NEXT: v_mov_b32_e32 v7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b32_e32 v8, s0
+; HEURRC-NEXT: v_mov_b32_e32 v9, s1
+; HEURRC-NEXT: v_mov_b32_e32 v10, s2
+; HEURRC-NEXT: v_mov_b32_e32 v11, s3
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
@@ -2930,7 +2824,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
@@ -2940,14 +2834,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; SDAG-NEXT: v_mov_b32_e32 v5, s13
; SDAG-NEXT: v_mov_b32_e32 v6, s14
; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b32_e32 v8, s0
+; SDAG-NEXT: v_mov_b32_e32 v9, s1
+; SDAG-NEXT: v_mov_b32_e32 v10, s2
+; SDAG-NEXT: v_mov_b32_e32 v11, s3
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
@@ -2959,16 +2853,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
@@ -2976,7 +2868,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
@@ -2986,14 +2878,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; HEURRC-NEXT: v_mov_b32_e32 v5, s13
; HEURRC-NEXT: v_mov_b32_e32 v6, s14
; HEURRC-NEXT: v_mov_b32_e32 v7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b32_e32 v8, s0
+; HEURRC-NEXT: v_mov_b32_e32 v9, s1
+; HEURRC-NEXT: v_mov_b32_e32 v10, s2
+; HEURRC-NEXT: v_mov_b32_e32 v11, s3
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
@@ -3084,19 +2976,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s24
-; SDAG-NEXT: v_mov_b32_e32 v1, s25
-; SDAG-NEXT: v_mov_b32_e32 v2, s26
-; SDAG-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-NEXT: v_mov_b32_e32 v8, s24
+; SDAG-NEXT: v_mov_b32_e32 v9, s25
+; SDAG-NEXT: v_mov_b32_e32 v10, s26
+; SDAG-NEXT: v_mov_b32_e32 v11, s27
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b32_e32 v4, s28
-; SDAG-NEXT: v_mov_b32_e32 v5, s29
-; SDAG-NEXT: v_mov_b32_e32 v6, s30
-; SDAG-NEXT: v_mov_b32_e32 v7, s31
+; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v13, s29
+; SDAG-NEXT: v_mov_b32_e32 v14, s30
+; SDAG-NEXT: v_mov_b32_e32 v15, s31
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -3112,44 +3004,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15]
+; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b32_e32 v9, s17
+; SDAG-NEXT: v_mov_b32_e32 v10, s18
+; SDAG-NEXT: v_mov_b32_e32 v11, s19
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_mov_b32_e32 v9, s21
+; SDAG-NEXT: v_mov_b32_e32 v10, s22
+; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -3214,19 +3104,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
-; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s24
-; HEURRC-NEXT: v_mov_b32_e32 v1, s25
-; HEURRC-NEXT: v_mov_b32_e32 v2, s26
-; HEURRC-NEXT: v_mov_b32_e32 v3, s27
+; HEURRC-NEXT: v_mov_b32_e32 v8, s24
+; HEURRC-NEXT: v_mov_b32_e32 v9, s25
+; HEURRC-NEXT: v_mov_b32_e32 v10, s26
+; HEURRC-NEXT: v_mov_b32_e32 v11, s27
; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v4, s28
-; HEURRC-NEXT: v_mov_b32_e32 v5, s29
-; HEURRC-NEXT: v_mov_b32_e32 v6, s30
-; HEURRC-NEXT: v_mov_b32_e32 v7, s31
+; HEURRC-NEXT: v_mov_b32_e32 v12, s28
+; HEURRC-NEXT: v_mov_b32_e32 v13, s29
+; HEURRC-NEXT: v_mov_b32_e32 v14, s30
+; HEURRC-NEXT: v_mov_b32_e32 v15, s31
; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
@@ -3242,44 +3132,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], 0
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15]
+; HEURRC-NEXT: v_mov_b32_e32 v8, s16
+; HEURRC-NEXT: v_mov_b32_e32 v9, s17
+; HEURRC-NEXT: v_mov_b32_e32 v10, s18
+; HEURRC-NEXT: v_mov_b32_e32 v11, s19
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v2, s10
+; HEURRC-NEXT: v_mov_b32_e32 v3, s11
+; HEURRC-NEXT: v_mov_b32_e32 v8, s20
+; HEURRC-NEXT: v_mov_b32_e32 v9, s21
+; HEURRC-NEXT: v_mov_b32_e32 v10, s22
+; HEURRC-NEXT: v_mov_b32_e32 v11, s23
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -3287,19 +3175,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
-; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
+; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v32, s24
-; VGPRRC-NEXT: v_mov_b32_e32 v33, s25
-; VGPRRC-NEXT: v_mov_b32_e32 v34, s26
-; VGPRRC-NEXT: v_mov_b32_e32 v35, s27
+; VGPRRC-NEXT: v_mov_b32_e32 v40, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v41, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v42, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v43, s27
; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT: v_mov_b32_e32 v36, s28
-; VGPRRC-NEXT: v_mov_b32_e32 v37, s29
-; VGPRRC-NEXT: v_mov_b32_e32 v38, s30
-; VGPRRC-NEXT: v_mov_b32_e32 v39, s31
+; VGPRRC-NEXT: v_mov_b32_e32 v44, s28
+; VGPRRC-NEXT: v_mov_b32_e32 v45, s29
+; VGPRRC-NEXT: v_mov_b32_e32 v46, s30
+; VGPRRC-NEXT: v_mov_b32_e32 v47, s31
; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -3307,45 +3195,45 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0
; VGPRRC-NEXT: s_nop 0
-; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15]
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[40:43], v[44:47], v[0:15]
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 3
-; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[24:27], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[20:23], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s16
; VGPRRC-NEXT: v_mov_b32_e32 v1, s17
; VGPRRC-NEXT: v_mov_b32_e32 v2, s18
; VGPRRC-NEXT: v_mov_b32_e32 v3, s19
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8:
@@ -3496,19 +3384,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s24
-; SDAG-NEXT: v_mov_b32_e32 v1, s25
-; SDAG-NEXT: v_mov_b32_e32 v2, s26
-; SDAG-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-NEXT: v_mov_b32_e32 v8, s24
+; SDAG-NEXT: v_mov_b32_e32 v9, s25
+; SDAG-NEXT: v_mov_b32_e32 v10, s26
+; SDAG-NEXT: v_mov_b32_e32 v11, s27
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b32_e32 v4, s28
-; SDAG-NEXT: v_mov_b32_e32 v5, s29
-; SDAG-NEXT: v_mov_b32_e32 v6, s30
-; SDAG-NEXT: v_mov_b32_e32 v7, s31
+; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v13, s29
+; SDAG-NEXT: v_mov_b32_e32 v14, s30
+; SDAG-NEXT: v_mov_b32_e32 v15, s31
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -3524,44 +3412,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
+; SDAG-NEXT: v_mov_b32_e32 v8, s16
+; SDAG-NEXT: v_mov_b32_e32 v9, s17
+; SDAG-NEXT: v_mov_b32_e32 v10, s18
+; SDAG-NEXT: v_mov_b32_e32 v11, s19
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_mov_b32_e32 v9, s21
+; SDAG-NEXT: v_mov_b32_e32 v10, s22
+; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -3626,19 +3512,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
-; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s24
-; HEURRC-NEXT: v_mov_b32_e32 v1, s25
-; HEURRC-NEXT: v_mov_b32_e32 v2, s26
-; HEURRC-NEXT: v_mov_b32_e32 v3, s27
+; HEURRC-NEXT: v_mov_b32_e32 v8, s24
+; HEURRC-NEXT: v_mov_b32_e32 v9, s25
+; HEURRC-NEXT: v_mov_b32_e32 v10, s26
+; HEURRC-NEXT: v_mov_b32_e32 v11, s27
; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v4, s28
-; HEURRC-NEXT: v_mov_b32_e32 v5, s29
-; HEURRC-NEXT: v_mov_b32_e32 v6, s30
-; HEURRC-NEXT: v_mov_b32_e32 v7, s31
+; HEURRC-NEXT: v_mov_b32_e32 v12, s28
+; HEURRC-NEXT: v_mov_b32_e32 v13, s29
+; HEURRC-NEXT: v_mov_b32_e32 v14, s30
+; HEURRC-NEXT: v_mov_b32_e32 v15, s31
; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
@@ -3654,44 +3540,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], 0
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
+; HEURRC-NEXT: v_mov_b32_e32 v8, s16
+; HEURRC-NEXT: v_mov_b32_e32 v9, s17
+; HEURRC-NEXT: v_mov_b32_e32 v10, s18
+; HEURRC-NEXT: v_mov_b32_e32 v11, s19
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v2, s10
+; HEURRC-NEXT: v_mov_b32_e32 v3, s11
+; HEURRC-NEXT: v_mov_b32_e32 v8, s20
+; HEURRC-NEXT: v_mov_b32_e32 v9, s21
+; HEURRC-NEXT: v_mov_b32_e32 v10, s22
+; HEURRC-NEXT: v_mov_b32_e32 v11, s23
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s8
; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
; HEURRC-NEXT: v_mov_b32_e32 v0, s12
; HEURRC-NEXT: v_mov_b32_e32 v1, s13
; HEURRC-NEXT: v_mov_b32_e32 v2, s14
; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -3699,19 +3583,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
-; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
+; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v32, s24
-; VGPRRC-NEXT: v_mov_b32_e32 v33, s25
-; VGPRRC-NEXT: v_mov_b32_e32 v34, s26
-; VGPRRC-NEXT: v_mov_b32_e32 v35, s27
+; VGPRRC-NEXT: v_mov_b32_e32 v40, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v41, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v42, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v43, s27
; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT: v_mov_b32_e32 v36, s28
-; VGPRRC-NEXT: v_mov_b32_e32 v37, s29
-; VGPRRC-NEXT: v_mov_b32_e32 v38, s30
-; VGPRRC-NEXT: v_mov_b32_e32 v39, s31
+; VGPRRC-NEXT: v_mov_b32_e32 v44, s28
+; VGPRRC-NEXT: v_mov_b32_e32 v45, s29
+; VGPRRC-NEXT: v_mov_b32_e32 v46, s30
+; VGPRRC-NEXT: v_mov_b32_e32 v47, s31
; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
@@ -3719,45 +3603,45 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0
; VGPRRC-NEXT: s_nop 0
-; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[40:43], v[44:47], v[0:15] cbsz:2 abid:3 blgp:1
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 3
-; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[28:31], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[24:27], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[20:23], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s16
; VGPRRC-NEXT: v_mov_b32_e32 v1, s17
; VGPRRC-NEXT: v_mov_b32_e32 v2, s18
; VGPRRC-NEXT: v_mov_b32_e32 v3, s19
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__flags:
@@ -4254,70 +4138,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v40, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v32, s20
+; SDAG-NEXT: v_mov_b32_e32 v33, s21
+; SDAG-NEXT: v_mov_b32_e32 v34, s22
+; SDAG-NEXT: v_mov_b32_e32 v35, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b32_e32 v4, s24
-; SDAG-NEXT: v_mov_b32_e32 v5, s25
-; SDAG-NEXT: v_mov_b32_e32 v6, s26
-; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_mov_b32_e32 v36, s24
+; SDAG-NEXT: v_mov_b32_e32 v37, s25
+; SDAG-NEXT: v_mov_b32_e32 v38, s26
+; SDAG-NEXT: v_mov_b32_e32 v39, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31]
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -4326,52 +4203,44 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GISEL-NEXT: v_mov_b32_e32 v24, 0
+; GISEL-NEXT: v_mov_b32_e32 v56, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15]
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -4379,70 +4248,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v40, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: v_mov_b32_e32 v32, s20
+; HEURRC-NEXT: v_mov_b32_e32 v33, s21
+; HEURRC-NEXT: v_mov_b32_e32 v34, s22
+; HEURRC-NEXT: v_mov_b32_e32 v35, s23
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b32_e32 v4, s24
-; HEURRC-NEXT: v_mov_b32_e32 v5, s25
-; HEURRC-NEXT: v_mov_b32_e32 v6, s26
-; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: v_mov_b32_e32 v36, s24
+; HEURRC-NEXT: v_mov_b32_e32 v37, s25
+; HEURRC-NEXT: v_mov_b32_e32 v38, s26
+; HEURRC-NEXT: v_mov_b32_e32 v39, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
-; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
+; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31]
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
+; HEURRC-NEXT: s_nop 6
+; HEURRC-NEXT: v_mov_b32_e32 v16, s20
+; HEURRC-NEXT: v_mov_b32_e32 v17, s21
+; HEURRC-NEXT: v_mov_b32_e32 v18, s22
+; HEURRC-NEXT: v_mov_b32_e32 v19, s23
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s12
-; HEURRC-NEXT: v_mov_b32_e32 v1, s13
-; HEURRC-NEXT: v_mov_b32_e32 v2, s14
-; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s12
+; HEURRC-NEXT: v_mov_b32_e32 v17, s13
+; HEURRC-NEXT: v_mov_b32_e32 v18, s14
+; HEURRC-NEXT: v_mov_b32_e32 v19, s15
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s8
+; HEURRC-NEXT: v_mov_b32_e32 v17, s9
+; HEURRC-NEXT: v_mov_b32_e32 v18, s10
+; HEURRC-NEXT: v_mov_b32_e32 v19, s11
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -4653,70 +4515,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v40, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v32, s20
+; SDAG-NEXT: v_mov_b32_e32 v33, s21
+; SDAG-NEXT: v_mov_b32_e32 v34, s22
+; SDAG-NEXT: v_mov_b32_e32 v35, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b32_e32 v4, s24
-; SDAG-NEXT: v_mov_b32_e32 v5, s25
-; SDAG-NEXT: v_mov_b32_e32 v6, s26
-; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_mov_b32_e32 v36, s24
+; SDAG-NEXT: v_mov_b32_e32 v37, s25
+; SDAG-NEXT: v_mov_b32_e32 v38, s26
+; SDAG-NEXT: v_mov_b32_e32 v39, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -4725,52 +4580,44 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GISEL-NEXT: v_mov_b32_e32 v24, 0
+; GISEL-NEXT: v_mov_b32_e32 v56, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -4778,70 +4625,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v40, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: v_mov_b32_e32 v32, s20
+; HEURRC-NEXT: v_mov_b32_e32 v33, s21
+; HEURRC-NEXT: v_mov_b32_e32 v34, s22
+; HEURRC-NEXT: v_mov_b32_e32 v35, s23
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b32_e32 v4, s24
-; HEURRC-NEXT: v_mov_b32_e32 v5, s25
-; HEURRC-NEXT: v_mov_b32_e32 v6, s26
-; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: v_mov_b32_e32 v36, s24
+; HEURRC-NEXT: v_mov_b32_e32 v37, s25
+; HEURRC-NEXT: v_mov_b32_e32 v38, s26
+; HEURRC-NEXT: v_mov_b32_e32 v39, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
-; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
+; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; HEURRC-NEXT: s_nop 6
+; HEURRC-NEXT: v_mov_b32_e32 v16, s20
+; HEURRC-NEXT: v_mov_b32_e32 v17, s21
+; HEURRC-NEXT: v_mov_b32_e32 v18, s22
+; HEURRC-NEXT: v_mov_b32_e32 v19, s23
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s12
-; HEURRC-NEXT: v_mov_b32_e32 v1, s13
-; HEURRC-NEXT: v_mov_b32_e32 v2, s14
-; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s12
+; HEURRC-NEXT: v_mov_b32_e32 v17, s13
+; HEURRC-NEXT: v_mov_b32_e32 v18, s14
+; HEURRC-NEXT: v_mov_b32_e32 v19, s15
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s8
+; HEURRC-NEXT: v_mov_b32_e32 v17, s9
+; HEURRC-NEXT: v_mov_b32_e32 v18, s10
+; HEURRC-NEXT: v_mov_b32_e32 v19, s11
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -5053,41 +4893,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b32_e32 v4, s24
-; SDAG-NEXT: v_mov_b32_e32 v5, s25
-; SDAG-NEXT: v_mov_b32_e32 v6, s26
-; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_mov_b32_e32 v20, s24
+; SDAG-NEXT: v_mov_b32_e32 v21, s25
+; SDAG-NEXT: v_mov_b32_e32 v22, s26
+; SDAG-NEXT: v_mov_b32_e32 v23, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
@@ -5096,35 +4928,27 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
@@ -5132,41 +4956,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: v_mov_b32_e32 v16, s20
+; HEURRC-NEXT: v_mov_b32_e32 v17, s21
+; HEURRC-NEXT: v_mov_b32_e32 v18, s22
+; HEURRC-NEXT: v_mov_b32_e32 v19, s23
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b32_e32 v4, s24
-; HEURRC-NEXT: v_mov_b32_e32 v5, s25
-; HEURRC-NEXT: v_mov_b32_e32 v6, s26
-; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: v_mov_b32_e32 v20, s24
+; HEURRC-NEXT: v_mov_b32_e32 v21, s25
+; HEURRC-NEXT: v_mov_b32_e32 v22, s26
+; HEURRC-NEXT: v_mov_b32_e32 v23, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
+; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 2
-; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
@@ -5287,41 +5103,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b32_e32 v4, s24
-; SDAG-NEXT: v_mov_b32_e32 v5, s25
-; SDAG-NEXT: v_mov_b32_e32 v6, s26
-; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_mov_b32_e32 v20, s24
+; SDAG-NEXT: v_mov_b32_e32 v21, s25
+; SDAG-NEXT: v_mov_b32_e32 v22, s26
+; SDAG-NEXT: v_mov_b32_e32 v23, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
@@ -5330,35 +5138,27 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
@@ -5366,41 +5166,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: v_mov_b32_e32 v16, s20
+; HEURRC-NEXT: v_mov_b32_e32 v17, s21
+; HEURRC-NEXT: v_mov_b32_e32 v18, s22
+; HEURRC-NEXT: v_mov_b32_e32 v19, s23
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b32_e32 v4, s24
-; HEURRC-NEXT: v_mov_b32_e32 v5, s25
-; HEURRC-NEXT: v_mov_b32_e32 v6, s26
-; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: v_mov_b32_e32 v20, s24
+; HEURRC-NEXT: v_mov_b32_e32 v21, s25
+; HEURRC-NEXT: v_mov_b32_e32 v22, s26
+; HEURRC-NEXT: v_mov_b32_e32 v23, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 2
-; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
@@ -5651,20 +5443,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GCN-NEXT: v_mov_b32_e32 v8, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
+; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
; GCN-NEXT: s_nop 7
-; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; GCN-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
@@ -5672,20 +5462,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
@@ -5755,20 +5543,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GCN-NEXT: v_mov_b32_e32 v8, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
+; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; GCN-NEXT: s_nop 7
-; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; GCN-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
@@ -5776,20 +5562,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
@@ -5853,5 +5637,5 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
ret void
}
-attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
attributes #1 = { "amdgpu-flat-work-group-size"="1,64" }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
index ccee113..856185b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
@@ -1,22 +1,116 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck --check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32, i32, <16 x i32>, i32, i32, i32)
declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32, i32)
-; GCN-LABEL: {{^}}test_mfma_i32_32x32x8i8:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN-DAG: s_load_dwordx16
-; GFX908-DAG-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX90A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GCN: v_mfma_i32_32x32x8i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX908-COUNT-16: v_accvgpr_read_b32
-; GFX908: global_store_dwordx4
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX90A-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_i32_32x32x8i8:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v0, 1
+; GFX908-NEXT: v_mov_b32_e32 v16, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v17, s0
+; GFX908-NEXT: v_mov_b32_e32 v1, s1
+; GFX908-NEXT: v_mov_b32_e32 v2, s2
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v17
+; GFX908-NEXT: v_mov_b32_e32 v17, s3
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v17
+; GFX908-NEXT: v_mov_b32_e32 v1, s4
+; GFX908-NEXT: v_mov_b32_e32 v2, s5
+; GFX908-NEXT: v_mov_b32_e32 v17, s6
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v17
+; GFX908-NEXT: v_mov_b32_e32 v1, s7
+; GFX908-NEXT: v_mov_b32_e32 v2, s8
+; GFX908-NEXT: v_mov_b32_e32 v17, s9
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v17
+; GFX908-NEXT: v_mov_b32_e32 v1, s10
+; GFX908-NEXT: v_mov_b32_e32 v2, s11
+; GFX908-NEXT: v_mov_b32_e32 v17, s12
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v17
+; GFX908-NEXT: v_mov_b32_e32 v1, s13
+; GFX908-NEXT: v_mov_b32_e32 v2, s14
+; GFX908-NEXT: v_mov_b32_e32 v17, s15
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v17
+; GFX908-NEXT: v_mov_b32_e32 v1, 2
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX908-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX908-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_i32_32x32x8i8:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX90A-NEXT: s_endpgm
bb:
%in.1 = load <16 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3)
@@ -24,18 +118,55 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_i32_16x16x16i8:
-; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; GCN: s_load_dwordx4
-; GFX908-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GFX90A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GCN: v_mfma_i32_16x16x16i8 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GFX908-COUNT-4: v_accvgpr_read_b32
-; GFX908: global_store_dwordx4
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX90A: global_store_dwordx4 v{{[0-9]+}}, [[RES]]
define amdgpu_kernel void @test_mfma_i32_16x16x16i8(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_i32_16x16x16i8:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v0, 1
+; GFX908-NEXT: v_mov_b32_e32 v1, 2
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v5, s0
+; GFX908-NEXT: v_mov_b32_e32 v2, s1
+; GFX908-NEXT: v_mov_b32_e32 v3, s2
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v5
+; GFX908-NEXT: v_mov_b32_e32 v5, s3
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_mfma_i32_16x16x16i8 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_i32_16x16x16i8:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1
+; GFX90A-NEXT: v_mov_b32_e32 v2, 2
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_i32_16x16x16i8 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX90A-NEXT: s_endpgm
bb:
%in.1 = load <4 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3)
@@ -44,3 +175,5 @@ bb:
}
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index ff305da..78be949 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -3,6 +3,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck -enable-var-scope --check-prefixes=GCN,LIT-SRCC,GFX908,GFX908_A %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A,GFX908_A,GFX90A_42 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,GFX90A_42 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefix=GFX942-VGPR %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
@@ -405,6 +406,63 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, s16
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, s17
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, s18
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, s19
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s20
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s21
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s22
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s23
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s25
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s26
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s27
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s28
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s29
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, s30
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, s31
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, s4
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, s5
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, s6
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, s7
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, s12
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, s13
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, s14
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
@@ -618,6 +676,33 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2.0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 1, i32 2, i32 3)
@@ -719,6 +804,23 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: s_nop 3
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 3
+; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -934,6 +1036,34 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x2f32:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2.0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x2_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float 1.0, float 2.0, <16 x float> %in.1, i32 1, i32 2, i32 3)
@@ -1039,6 +1169,24 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x4f32:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float 1.0, float 2.0, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -1114,19 +1262,19 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v2
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v1
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s14
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s15
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s14
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s15
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s0
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s1
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s0
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v6
-; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v2
-; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v3
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s2
-; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s3
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v0
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v1
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s2
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s3
; NOLIT-SRCC-NEXT: s_nop 1
-; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
+; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 1
@@ -1254,19 +1402,19 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v2
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v0
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v1
-; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s14
-; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s15
+; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s14
+; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s15
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
-; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s0
-; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s1
+; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s0
+; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s1
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v6
-; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v2
-; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v3
-; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s2
-; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s3
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v0
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v1
+; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s2
+; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s3
; LIT-SRCC-NEXT: s_nop 1
-; LIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
+; LIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 1
@@ -1330,7 +1478,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX90A-LABEL: test_mfma_f32_32x32x4f16:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40
; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0
@@ -1345,8 +1493,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NEXT: v_mov_b32_e32 v2, s0
+; GFX90A-NEXT: v_mov_b32_e32 v3, s1
; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20
; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21
; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22
@@ -1371,27 +1519,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13
; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14
; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
-; GFX90A-NEXT: v_mov_b32_e32 v2, s2
-; GFX90A-NEXT: v_mov_b32_e32 v3, s3
+; GFX90A-NEXT: v_mov_b32_e32 v4, s2
+; GFX90A-NEXT: v_mov_b32_e32 v5, s3
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
-; GFX90A-NEXT: global_store_dwordx4 v4, a[24:27], s[36:37] offset:96
-; GFX90A-NEXT: global_store_dwordx4 v4, a[28:31], s[36:37] offset:112
-; GFX90A-NEXT: global_store_dwordx4 v4, a[16:19], s[36:37] offset:64
-; GFX90A-NEXT: global_store_dwordx4 v4, a[20:23], s[36:37] offset:80
-; GFX90A-NEXT: global_store_dwordx4 v4, a[8:11], s[36:37] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v4, a[12:15], s[36:37] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v4, a[0:3], s[36:37]
-; GFX90A-NEXT: global_store_dwordx4 v4, a[4:7], s[36:37] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[36:37] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[36:37] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[36:37] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[36:37] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[36:37]
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[36:37] offset:16
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_32x32x4f16:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40
; GFX942-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0
@@ -1406,8 +1554,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX942-NEXT: v_accvgpr_write_b32 a2, s18
; GFX942-NEXT: v_accvgpr_write_b32 a3, s19
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s0
-; GFX942-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NEXT: v_mov_b32_e32 v2, s0
+; GFX942-NEXT: v_mov_b32_e32 v3, s1
; GFX942-NEXT: v_accvgpr_write_b32 a4, s20
; GFX942-NEXT: v_accvgpr_write_b32 a5, s21
; GFX942-NEXT: v_accvgpr_write_b32 a6, s22
@@ -1432,22 +1580,83 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX942-NEXT: v_accvgpr_write_b32 a29, s13
; GFX942-NEXT: v_accvgpr_write_b32 a30, s14
; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
-; GFX942-NEXT: v_mov_b32_e32 v2, s2
-; GFX942-NEXT: v_mov_b32_e32 v3, s3
+; GFX942-NEXT: v_mov_b32_e32 v4, s2
+; GFX942-NEXT: v_mov_b32_e32 v5, s3
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 2
-; GFX942-NEXT: global_store_dwordx4 v4, a[24:27], s[36:37] offset:96
-; GFX942-NEXT: global_store_dwordx4 v4, a[28:31], s[36:37] offset:112
-; GFX942-NEXT: global_store_dwordx4 v4, a[16:19], s[36:37] offset:64
-; GFX942-NEXT: global_store_dwordx4 v4, a[20:23], s[36:37] offset:80
-; GFX942-NEXT: global_store_dwordx4 v4, a[8:11], s[36:37] offset:32
-; GFX942-NEXT: global_store_dwordx4 v4, a[12:15], s[36:37] offset:48
-; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[36:37]
-; GFX942-NEXT: global_store_dwordx4 v4, a[4:7], s[36:37] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[36:37] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[36:37] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[36:37] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[36:37] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[36:37]
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[36:37] offset:16
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x4f16:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s3
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[38:39], 0x0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, s16
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, s17
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, s18
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, s19
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, s0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v35, s1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s20
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s21
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s22
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s23
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s25
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s26
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s27
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s28
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s29
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, s30
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, s31
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, s4
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, s5
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, s6
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, s7
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, s12
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, s13
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, s14
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v36, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v37, s3
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x4_2b_f16 v[0:31], v[34:35], v[36:37], v[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[36:37] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[36:37] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[36:37] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[36:37] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[36:37] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[36:37] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[36:37]
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[36:37] offset:16
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%c.1 = load <4 x half>, ptr addrspace(1) %c
@@ -1676,6 +1885,36 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x4f16:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s20
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s21
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s22
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s23
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_4b_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%c.1 = load <4 x half>, ptr addrspace(1) %c
@@ -1752,46 +1991,66 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add
; GFX90A-LABEL: test_mfma_f32_4x4x4f16:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: v_mov_b32_e32 v3, s5
; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8
-; GFX90A-NEXT: v_mov_b32_e32 v2, s6
-; GFX90A-NEXT: v_mov_b32_e32 v3, s7
+; GFX90A-NEXT: v_mov_b32_e32 v4, s6
+; GFX90A-NEXT: v_mov_b32_e32 v5, s7
; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9
; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_4x4x4f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 4
-; GFX90A-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_4x4x4f16:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX942-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s4
-; GFX942-NEXT: v_mov_b32_e32 v1, s5
+; GFX942-NEXT: v_mov_b32_e32 v2, s4
+; GFX942-NEXT: v_mov_b32_e32 v3, s5
; GFX942-NEXT: v_accvgpr_write_b32 a0, s8
-; GFX942-NEXT: v_mov_b32_e32 v2, s6
-; GFX942-NEXT: v_mov_b32_e32 v3, s7
+; GFX942-NEXT: v_mov_b32_e32 v4, s6
+; GFX942-NEXT: v_mov_b32_e32 v5, s7
; GFX942-NEXT: v_accvgpr_write_b32 a1, s9
; GFX942-NEXT: v_accvgpr_write_b32 a2, s10
; GFX942-NEXT: v_accvgpr_write_b32 a3, s11
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_4x4x4_16b_f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_4x4x4_16b_f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 4
-; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4f16:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s4
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s5
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s6
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s7
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 4
+; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%c.1 = load <4 x half>, ptr addrspace(1) %c
@@ -2021,6 +2280,36 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8f16:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s20
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s21
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s22
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s23
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%c.1 = load <4 x half>, ptr addrspace(1) %c
@@ -2099,47 +2388,67 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr
; GFX90A-LABEL: test_mfma_f32_16x16x16f16:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NEXT: v_mov_b32_e32 v3, s5
; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8
-; GFX90A-NEXT: v_mov_b32_e32 v2, s6
-; GFX90A-NEXT: v_mov_b32_e32 v3, s7
+; GFX90A-NEXT: v_mov_b32_e32 v4, s6
+; GFX90A-NEXT: v_mov_b32_e32 v5, s7
; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9
; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
-; GFX90A-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_16x16x16f16:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX942-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s4
-; GFX942-NEXT: v_mov_b32_e32 v1, s5
+; GFX942-NEXT: v_mov_b32_e32 v2, s4
+; GFX942-NEXT: v_mov_b32_e32 v3, s5
; GFX942-NEXT: v_accvgpr_write_b32 a0, s8
-; GFX942-NEXT: v_mov_b32_e32 v2, s6
-; GFX942-NEXT: v_mov_b32_e32 v3, s7
+; GFX942-NEXT: v_mov_b32_e32 v4, s6
+; GFX942-NEXT: v_mov_b32_e32 v5, s7
; GFX942-NEXT: v_accvgpr_write_b32 a1, s9
; GFX942-NEXT: v_accvgpr_write_b32 a2, s10
; GFX942-NEXT: v_accvgpr_write_b32 a3, s11
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 6
-; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16f16:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s4
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s5
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s6
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s7
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 6
+; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%c.1 = load <4 x half>, ptr addrspace(1) %c
@@ -2508,6 +2817,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_i32_32x32x4i8:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, s16
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, s17
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, s18
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, s19
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s20
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s21
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s22
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s23
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s25
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s26
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s27
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s28
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s29
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, s30
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, s31
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, s4
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, s5
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, s6
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, s7
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, s12
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, s13
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, s14
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_i32_32x32x4_2b_i8 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <32 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 1, i32 2, <32 x i32> %in.1, i32 1, i32 2, i32 3)
@@ -2721,6 +3087,33 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_i32_16x16x4i8:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3)
@@ -2728,6 +3121,143 @@ bb:
ret void
}
+define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspace(1) %arg) #0 {
+; NOLIT-SRCC-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
+; NOLIT-SRCC: ; %bb.0: ; %bb
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, 64
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, 64
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
+; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
+; NOLIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
+; NOLIT-SRCC-NEXT: s_nop 7
+; NOLIT-SRCC-NEXT: s_nop 1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8
+; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; NOLIT-SRCC-NEXT: s_endpgm
+;
+; LIT-SRCC-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
+; LIT-SRCC: ; %bb.0: ; %bb
+; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
+; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
+; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
+; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
+; LIT-SRCC-NEXT: s_nop 7
+; LIT-SRCC-NEXT: s_nop 1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8
+; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; LIT-SRCC-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 2
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: v_mov_b32_e32 v0, 1
+; GFX942-NEXT: v_mov_b32_e32 v1, 2
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
+bb:
+ %in.1 = load <16 x i32>, ptr addrspace(1) %arg
+ %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 1, i32 2, <16 x i32> splat (i32 64), i32 1, i32 2, i32 3)
+ store <16 x i32> %mai.1, ptr addrspace(1) %arg
+ ret void
+}
+
define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-LABEL: test_mfma_i32_4x4x4i8:
; NOLIT-SRCC: ; %bb.0: ; %bb
@@ -2822,6 +3352,23 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: s_nop 4
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 4
+; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3)
@@ -2829,6 +3376,197 @@ bb:
ret void
}
+define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_imm_src2_1(ptr addrspace(1) %arg) #0 {
+; NOLIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
+; NOLIT-SRCC: ; %bb.0: ; %bb
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 1
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1
+; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
+; NOLIT-SRCC-NEXT: s_nop 0
+; NOLIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
+; NOLIT-SRCC-NEXT: s_nop 3
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
+; NOLIT-SRCC-NEXT: s_nop 0
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; NOLIT-SRCC-NEXT: s_endpgm
+;
+; LIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
+; LIT-SRCC: ; %bb.0: ; %bb
+; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
+; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
+; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
+; LIT-SRCC-NEXT: s_nop 0
+; LIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, 1 cbsz:1 abid:2 blgp:3
+; LIT-SRCC-NEXT: s_nop 3
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
+; LIT-SRCC-NEXT: s_nop 0
+; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; LIT-SRCC-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v2, 2
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v2, 1 cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_nop 3
+; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
+; GFX942: ; %bb.0: ; %bb
+; GFX942-NEXT: v_mov_b32_e32 v0, 1
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 a[0:3], v0, v2, 1 cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v0, v1, 1 cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 3
+; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
+bb:
+ %in.1 = load <4 x i32>, ptr addrspace(1) %arg
+ %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> splat (i32 1), i32 1, i32 2, i32 3)
+ store <4 x i32> %mai.1, ptr addrspace(1) %arg
+ ret void
+}
+
+define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_k_src2_1(ptr addrspace(1) %arg) #0 {
+; NOLIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
+; NOLIT-SRCC: ; %bb.0:
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 0x41
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 1
+; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v0
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v0
+; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v0
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2
+; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
+; NOLIT-SRCC-NEXT: s_nop 0
+; NOLIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v1, v0, a[0:3] cbsz:1 abid:2 blgp:3
+; NOLIT-SRCC-NEXT: s_nop 3
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
+; NOLIT-SRCC-NEXT: s_nop 0
+; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; NOLIT-SRCC-NEXT: s_endpgm
+;
+; LIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
+; LIT-SRCC: ; %bb.0:
+; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 0x41
+; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 1
+; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v0
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v0
+; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v0
+; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2
+; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
+; LIT-SRCC-NEXT: s_nop 0
+; LIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v1, v0, a[0:3] cbsz:1 abid:2 blgp:3
+; LIT-SRCC-NEXT: s_nop 3
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
+; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
+; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
+; LIT-SRCC-NEXT: s_nop 0
+; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; LIT-SRCC-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 2
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v1, v2, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_nop 3
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_mov_b32_e32 v1, 0x41
+; GFX942-NEXT: v_accvgpr_write_b32 a0, v1
+; GFX942-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
+; GFX942-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 a[0:3], v1, v2, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
+; GFX942-VGPR: ; %bb.0:
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x41
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v5, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 3
+; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
+ %in.1 = load <4 x i32>, ptr addrspace(1) %arg
+ %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> splat (i32 65), i32 1, i32 2, i32 3)
+ store <4 x i32> %mai.1, ptr addrspace(1) %arg
+ ret void
+}
+
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_forward_acc:
; NOLIT-SRCC: ; %bb.0: ; %bb
@@ -3219,6 +3957,64 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_forward_acc:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 2.0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, s16
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, s17
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, s18
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, s19
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s20
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s21
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s22
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s23
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s25
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s26
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s27
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s28
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s29
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, s30
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, s31
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s3
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, s4
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, s5
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, s6
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, s7
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, s12
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, s13
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, s14
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -3435,6 +4231,34 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_forward_acc:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2.0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15]
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 0, i32 0, i32 0)
@@ -3542,6 +4366,25 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) %
; GFX942-NEXT: s_nop 3
; GFX942-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_forward_acc:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3]
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3]
+; GFX942-VGPR-NEXT: s_nop 3
+; GFX942-VGPR-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
@@ -3616,6 +4459,19 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm_splat(ptr addrspace(1) %ar
; GFX942-NEXT: s_nop 2
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_imm_splat:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, 1.0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
store <4 x float> %mai.1, ptr addrspace(1) %arg
@@ -3745,6 +4601,22 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_imm_splat:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v1, 1.0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
store <16 x float> %mai.1, ptr addrspace(1) %arg
@@ -3885,6 +4757,24 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8f16_imm_splat:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x3c003c00
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, 0x40004000
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[0:1], v[2:3], 1.0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
store <16 x float> %mai.1, ptr addrspace(1) %arg
@@ -4091,6 +4981,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_imm_splat:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v0, v1, 0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
store <32 x float> %mai.1, ptr addrspace(1) %arg
@@ -4175,6 +5086,21 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0
; GFX942-NEXT: s_nop 2
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_imm:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[0:3]
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 2.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
store <4 x float> %mai.1, ptr addrspace(1) %arg
@@ -4355,6 +5281,36 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_imm:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, v0
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v15, v[0:15]
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 2.0>, i32 0, i32 0, i32 0)
store <16 x float> %mai.1, ptr addrspace(1) %arg
@@ -4667,6 +5623,74 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_imm:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, v1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, v1
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[32:33], v[30:31]
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2.0
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[30:31], v[28:29]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[28:29], v[26:27]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[26:27], v[24:25]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[24:25], v[22:23]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[22:23], v[20:21]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[20:21], v[18:19]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[18:19], v[16:17]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[16:17], v[14:15]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], v[12:13]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], v[10:11]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], v[8:9]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v0, v34, v[2:33]
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[30:33], s[0:1] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[26:29], s[0:1] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[22:25], s[0:1] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[18:21], s[0:1] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[14:17], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[10:13], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[6:9], s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
store <32 x float> %mai.1, ptr addrspace(1) %arg
@@ -4755,6 +5779,24 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %ar
; GFX942-NEXT: s_nop 2
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1.0
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-VGPR-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x42f60000
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3]
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid
@@ -4846,6 +5888,23 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(ptr addrspa
; GFX942-NEXT: s_nop 2
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1.0
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x42f60000
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3]
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: s_nop 2
+; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid
@@ -5109,6 +6168,37 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: s_endpgm
+;
+; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_vecarg:
+; GFX942-VGPR: ; %bb.0: ; %bb
+; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-VGPR-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2.0
+; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX942-VGPR-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX942-VGPR-NEXT: s_waitcnt vmcnt(0)
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 7
+; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX942-VGPR-NEXT: s_endpgm
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %arg, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
index 04ee0bb..f78ea92 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
@@ -1485,30 +1485,30 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v12, s0
-; SDAG-NEXT: v_mov_b32_e32 v13, s1
-; SDAG-NEXT: v_mov_b32_e32 v14, s2
-; SDAG-NEXT: v_mov_b32_e32 v15, s3
-; SDAG-NEXT: v_mov_b32_e32 v16, s16
-; SDAG-NEXT: v_mov_b32_e32 v17, s17
-; SDAG-NEXT: v_mov_b32_e32 v18, s18
-; SDAG-NEXT: v_mov_b32_e32 v19, s19
-; SDAG-NEXT: v_mov_b32_e32 v20, s28
-; SDAG-NEXT: v_mov_b32_e32 v21, s29
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
-; SDAG-NEXT: v_mov_b32_e32 v8, s24
-; SDAG-NEXT: v_mov_b32_e32 v9, s25
-; SDAG-NEXT: v_mov_b32_e32 v10, s26
-; SDAG-NEXT: v_mov_b32_e32 v11, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v21
+; SDAG-NEXT: v_mov_b32_e32 v14, s0
+; SDAG-NEXT: v_mov_b32_e32 v15, s1
+; SDAG-NEXT: v_mov_b32_e32 v16, s2
+; SDAG-NEXT: v_mov_b32_e32 v17, s3
+; SDAG-NEXT: v_mov_b32_e32 v18, s16
+; SDAG-NEXT: v_mov_b32_e32 v19, s17
+; SDAG-NEXT: v_mov_b32_e32 v20, s18
+; SDAG-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s28
+; SDAG-NEXT: v_mov_b32_e32 v5, s29
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s24
+; SDAG-NEXT: v_mov_b32_e32 v11, s25
+; SDAG-NEXT: v_mov_b32_e32 v12, s26
+; SDAG-NEXT: v_mov_b32_e32 v13, s27
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v4
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v5
; SDAG-NEXT: v_accvgpr_write_b32 a2, v0
; SDAG-NEXT: v_accvgpr_write_b32 a3, v1
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[12:19], v[4:11], a[0:3], v2, v3 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[6:13], a[0:3], v2, v3 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1895,7 +1895,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
@@ -1915,16 +1915,16 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
; SDAG-NEXT: v_mov_b32_e32 v14, s22
; SDAG-NEXT: v_mov_b32_e32 v15, s23
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: v_mov_b32_e32 v21, s13
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s12, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s12, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[14:15]
+; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[14:15]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd:
@@ -1937,20 +1937,18 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s24
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s25
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s26
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s27
-; GISEL-NEXT: v_mov_b32_e32 v16, s29
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_mov_b32_e32 v20, s29
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s28, v20 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[30:31]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -1964,7 +1962,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
; SDAG-NEXT: s_movk_i32 s6, 0x41
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
@@ -1974,7 +1972,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v5, s13
; SDAG-NEXT: v_mov_b32_e32 v6, s14
; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; SDAG-NEXT: v_mov_b32_e32 v8, s16
; SDAG-NEXT: v_mov_b32_e32 v9, s17
; SDAG-NEXT: v_mov_b32_e32 v10, s18
@@ -1983,21 +1981,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v13, s21
; SDAG-NEXT: v_mov_b32_e32 v14, s22
; SDAG-NEXT: v_mov_b32_e32 v15, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v20, 0x41
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
@@ -2005,19 +2001,17 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -2031,7 +2025,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
; SDAG-NEXT: s_movk_i32 s6, 0x41
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
@@ -2041,7 +2035,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v5, s13
; SDAG-NEXT: v_mov_b32_e32 v6, s14
; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; SDAG-NEXT: v_mov_b32_e32 v8, s16
; SDAG-NEXT: v_mov_b32_e32 v9, s17
; SDAG-NEXT: v_mov_b32_e32 v10, s18
@@ -2050,21 +2044,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v13, s21
; SDAG-NEXT: v_mov_b32_e32 v14, s22
; SDAG-NEXT: v_mov_b32_e32 v15, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v20, 0x41
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
@@ -2072,19 +2064,17 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -2096,7 +2086,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
@@ -2107,7 +2097,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v5, s13
; SDAG-NEXT: v_mov_b32_e32 v6, s14
; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; SDAG-NEXT: v_mov_b32_e32 v8, s16
; SDAG-NEXT: v_mov_b32_e32 v9, s17
; SDAG-NEXT: v_mov_b32_e32 v10, s18
@@ -2116,14 +2106,12 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v13, s21
; SDAG-NEXT: v_mov_b32_e32 v14, s22
; SDAG-NEXT: v_mov_b32_e32 v15, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm:
@@ -2136,21 +2124,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -2162,7 +2148,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v0, s8
@@ -2173,7 +2159,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v5, s13
; SDAG-NEXT: v_mov_b32_e32 v6, s14
; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; SDAG-NEXT: v_mov_b32_e32 v8, s16
; SDAG-NEXT: v_mov_b32_e32 v9, s17
; SDAG-NEXT: v_mov_b32_e32 v10, s18
@@ -2182,14 +2168,12 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_mov_b32_e32 v13, s21
; SDAG-NEXT: v_mov_b32_e32 v14, s22
; SDAG-NEXT: v_mov_b32_e32 v15, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
+; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal:
@@ -2202,21 +2186,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; GISEL-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491)
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -2559,5 +2541,5 @@ declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6
declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
-attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index 91197f9..0b2818f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -3515,26 +3515,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v24, s0
-; SDAG-NEXT: v_mov_b32_e32 v25, s1
-; SDAG-NEXT: v_mov_b32_e32 v26, s2
-; SDAG-NEXT: v_mov_b32_e32 v27, s3
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
-; SDAG-NEXT: v_mov_b32_e32 v29, s17
-; SDAG-NEXT: v_mov_b32_e32 v30, s18
-; SDAG-NEXT: v_mov_b32_e32 v31, s19
-; SDAG-NEXT: v_mov_b32_e32 v32, s28
-; SDAG-NEXT: v_mov_b32_e32 v33, s29
-; SDAG-NEXT: v_mov_b32_e32 v16, s20
-; SDAG-NEXT: v_mov_b32_e32 v17, s21
-; SDAG-NEXT: v_mov_b32_e32 v18, s22
-; SDAG-NEXT: v_mov_b32_e32 v19, s23
-; SDAG-NEXT: v_mov_b32_e32 v20, s24
-; SDAG-NEXT: v_mov_b32_e32 v21, s25
-; SDAG-NEXT: v_mov_b32_e32 v22, s26
-; SDAG-NEXT: v_mov_b32_e32 v23, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v32
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v33
+; SDAG-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-NEXT: v_mov_b32_e32 v30, s16
+; SDAG-NEXT: v_mov_b32_e32 v31, s17
+; SDAG-NEXT: v_mov_b32_e32 v32, s18
+; SDAG-NEXT: v_mov_b32_e32 v33, s19
+; SDAG-NEXT: v_mov_b32_e32 v16, s28
+; SDAG-NEXT: v_mov_b32_e32 v17, s29
+; SDAG-NEXT: v_mov_b32_e32 v18, s20
+; SDAG-NEXT: v_mov_b32_e32 v19, s21
+; SDAG-NEXT: v_mov_b32_e32 v20, s22
+; SDAG-NEXT: v_mov_b32_e32 v21, s23
+; SDAG-NEXT: v_mov_b32_e32 v22, s24
+; SDAG-NEXT: v_mov_b32_e32 v23, s25
+; SDAG-NEXT: v_mov_b32_e32 v24, s26
+; SDAG-NEXT: v_mov_b32_e32 v25, s27
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
; SDAG-NEXT: v_accvgpr_write_b32 a2, v0
; SDAG-NEXT: v_accvgpr_write_b32 a3, v1
; SDAG-NEXT: v_accvgpr_write_b32 a4, v2
@@ -3550,7 +3550,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-NEXT: v_accvgpr_write_b32 a14, v12
; SDAG-NEXT: v_accvgpr_write_b32 a15, v13
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[24:31], v[16:23], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[18:25], a[0:15], v14, v15 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
@@ -3993,34 +3993,34 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v16, s0
-; SDAG-NEXT: v_mov_b32_e32 v17, s1
-; SDAG-NEXT: v_mov_b32_e32 v18, s2
-; SDAG-NEXT: v_mov_b32_e32 v19, s3
-; SDAG-NEXT: v_mov_b32_e32 v20, s16
-; SDAG-NEXT: v_mov_b32_e32 v21, s17
-; SDAG-NEXT: v_mov_b32_e32 v22, s18
-; SDAG-NEXT: v_mov_b32_e32 v23, s19
-; SDAG-NEXT: v_mov_b32_e32 v24, s20
-; SDAG-NEXT: v_mov_b32_e32 v25, s21
-; SDAG-NEXT: v_mov_b32_e32 v26, s22
-; SDAG-NEXT: v_mov_b32_e32 v27, s23
-; SDAG-NEXT: v_mov_b32_e32 v28, s24
-; SDAG-NEXT: v_mov_b32_e32 v29, s25
-; SDAG-NEXT: v_mov_b32_e32 v30, s26
-; SDAG-NEXT: v_mov_b32_e32 v31, s27
-; SDAG-NEXT: v_mov_b32_e32 v32, s28
-; SDAG-NEXT: v_mov_b32_e32 v33, s29
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v31
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v32
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v33
+; SDAG-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-NEXT: v_mov_b32_e32 v30, s16
+; SDAG-NEXT: v_mov_b32_e32 v31, s17
+; SDAG-NEXT: v_mov_b32_e32 v32, s18
+; SDAG-NEXT: v_mov_b32_e32 v33, s19
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: v_mov_b32_e32 v20, s24
+; SDAG-NEXT: v_mov_b32_e32 v21, s25
+; SDAG-NEXT: v_mov_b32_e32 v22, s26
+; SDAG-NEXT: v_mov_b32_e32 v23, s27
+; SDAG-NEXT: v_mov_b32_e32 v24, s28
+; SDAG-NEXT: v_mov_b32_e32 v25, s29
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
; SDAG-NEXT: v_accvgpr_write_b32 a10, v8
; SDAG-NEXT: v_accvgpr_write_b32 a11, v9
; SDAG-NEXT: v_accvgpr_write_b32 a12, v10
@@ -4028,7 +4028,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; SDAG-NEXT: v_accvgpr_write_b32 a14, v12
; SDAG-NEXT: v_accvgpr_write_b32 a15, v13
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
@@ -4539,49 +4539,41 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s36
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
-; SDAG-NEXT: v_mov_b32_e32 v12, s20
-; SDAG-NEXT: v_mov_b32_e32 v13, s21
-; SDAG-NEXT: v_mov_b32_e32 v14, s22
-; SDAG-NEXT: v_mov_b32_e32 v15, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s37
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s38
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s39
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s40
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s41
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s42
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s43
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s44
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s45
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s46
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s47
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s48
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s49
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s50
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s51
-; SDAG-NEXT: v_mov_b32_e32 v16, s1
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: v_mov_b32_e32 v20, s12
+; SDAG-NEXT: v_mov_b32_e32 v21, s13
+; SDAG-NEXT: v_mov_b32_e32 v22, s14
+; SDAG-NEXT: v_mov_b32_e32 v23, s15
+; SDAG-NEXT: v_mov_b32_e32 v24, s16
+; SDAG-NEXT: v_mov_b32_e32 v25, s17
+; SDAG-NEXT: v_mov_b32_e32 v26, s18
+; SDAG-NEXT: v_mov_b32_e32 v27, s19
+; SDAG-NEXT: v_mov_b32_e32 v28, s20
+; SDAG-NEXT: v_mov_b32_e32 v29, s21
+; SDAG-NEXT: v_mov_b32_e32 v30, s22
+; SDAG-NEXT: v_mov_b32_e32 v31, s23
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; SDAG-NEXT: v_mov_b32_e32 v32, s1
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s0, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd:
@@ -4590,41 +4582,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s36
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s37
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s38
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s39
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s40
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s41
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s42
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s43
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s44
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s45
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s46
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s47
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s48
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s49
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s50
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s51
-; GISEL-NEXT: v_mov_b32_e32 v16, s1
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
+; GISEL-NEXT: v_mov_b32_e32 v32, s1
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s0, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
store <16 x float> %result, ptr addrspace(1) %ptr, align 64
@@ -4639,91 +4623,75 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_
; SDAG-NEXT: s_movk_i32 s2, 0x41
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v4, s12
-; SDAG-NEXT: v_mov_b32_e32 v5, s13
-; SDAG-NEXT: v_mov_b32_e32 v6, s14
-; SDAG-NEXT: v_mov_b32_e32 v7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s36
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
-; SDAG-NEXT: v_mov_b32_e32 v12, s20
-; SDAG-NEXT: v_mov_b32_e32 v13, s21
-; SDAG-NEXT: v_mov_b32_e32 v14, s22
-; SDAG-NEXT: v_mov_b32_e32 v15, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s37
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s38
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s39
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s40
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s41
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s42
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s43
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s44
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s45
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s46
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s47
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s48
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s49
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s50
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s51
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: v_mov_b32_e32 v20, s12
+; SDAG-NEXT: v_mov_b32_e32 v21, s13
+; SDAG-NEXT: v_mov_b32_e32 v22, s14
+; SDAG-NEXT: v_mov_b32_e32 v23, s15
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; SDAG-NEXT: v_mov_b32_e32 v24, s16
+; SDAG-NEXT: v_mov_b32_e32 v25, s17
+; SDAG-NEXT: v_mov_b32_e32 v26, s18
+; SDAG-NEXT: v_mov_b32_e32 v27, s19
+; SDAG-NEXT: v_mov_b32_e32 v28, s20
+; SDAG-NEXT: v_mov_b32_e32 v29, s21
+; SDAG-NEXT: v_mov_b32_e32 v30, s22
+; SDAG-NEXT: v_mov_b32_e32 v31, s23
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s2, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s2, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v32, 0x41
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s36
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s37
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s38
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s39
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s40
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s41
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s42
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s43
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s44
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s45
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s46
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s47
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s48
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s49
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s50
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s51
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2)
store <16 x float> %result, ptr addrspace(1) %ptr, align 64
@@ -4735,26 +4703,26 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: v_mov_b32_e32 v4, s16
-; SDAG-NEXT: v_mov_b32_e32 v5, s17
-; SDAG-NEXT: v_mov_b32_e32 v6, s18
-; SDAG-NEXT: v_mov_b32_e32 v7, s19
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: v_mov_b32_e32 v2, s12
+; SDAG-NEXT: v_mov_b32_e32 v3, s13
+; SDAG-NEXT: v_mov_b32_e32 v4, s14
+; SDAG-NEXT: v_mov_b32_e32 v5, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s16
+; SDAG-NEXT: v_mov_b32_e32 v7, s17
+; SDAG-NEXT: v_mov_b32_e32 v8, s18
+; SDAG-NEXT: v_mov_b32_e32 v9, s19
+; SDAG-NEXT: v_mov_b32_e32 v10, s20
+; SDAG-NEXT: v_mov_b32_e32 v11, s21
+; SDAG-NEXT: v_mov_b32_e32 v12, s22
+; SDAG-NEXT: v_mov_b32_e32 v13, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v13, s25
-; SDAG-NEXT: v_mov_b32_e32 v14, s26
+; SDAG-NEXT: v_mov_b32_e32 v14, s24
+; SDAG-NEXT: v_mov_b32_e32 v15, s25
+; SDAG-NEXT: v_mov_b32_e32 v16, s26
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: v_mov_b32_e32 v17, s27
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
@@ -4770,45 +4738,44 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b32_e32 v16, s1
+; SDAG-NEXT: v_mov_b32_e32 v0, s1
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0]
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], s0, v0 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v6, s18
+; SDAG-NEXT: v_mov_b32_e32 v7, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s16
+; SDAG-NEXT: v_mov_b32_e32 v5, s17
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v10, s10
+; SDAG-NEXT: v_mov_b32_e32 v11, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s8
+; SDAG-NEXT: v_mov_b32_e32 v9, s9
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -4922,42 +4889,41 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: v_mov_b32_e32 v3, s21
+; SDAG-NEXT: v_mov_b32_e32 v4, s22
+; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v6, s18
+; SDAG-NEXT: v_mov_b32_e32 v7, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s16
+; SDAG-NEXT: v_mov_b32_e32 v5, s17
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v8, s14
+; SDAG-NEXT: v_mov_b32_e32 v9, s15
+; SDAG-NEXT: v_mov_b32_e32 v6, s12
+; SDAG-NEXT: v_mov_b32_e32 v7, s13
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v10, s10
+; SDAG-NEXT: v_mov_b32_e32 v11, s11
+; SDAG-NEXT: v_mov_b32_e32 v8, s8
+; SDAG-NEXT: v_mov_b32_e32 v9, s9
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -5033,78 +4999,72 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: v_mov_b32_e32 v4, s16
-; SDAG-NEXT: v_mov_b32_e32 v5, s17
-; SDAG-NEXT: v_mov_b32_e32 v6, s18
-; SDAG-NEXT: v_mov_b32_e32 v7, s19
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: v_mov_b32_e32 v32, s12
+; SDAG-NEXT: v_mov_b32_e32 v33, s13
+; SDAG-NEXT: v_mov_b32_e32 v34, s14
+; SDAG-NEXT: v_mov_b32_e32 v35, s15
+; SDAG-NEXT: v_mov_b32_e32 v36, s16
+; SDAG-NEXT: v_mov_b32_e32 v37, s17
+; SDAG-NEXT: v_mov_b32_e32 v38, s18
+; SDAG-NEXT: v_mov_b32_e32 v39, s19
+; SDAG-NEXT: v_mov_b32_e32 v40, s20
+; SDAG-NEXT: v_mov_b32_e32 v41, s21
+; SDAG-NEXT: v_mov_b32_e32 v42, s22
+; SDAG-NEXT: v_mov_b32_e32 v43, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v13, s25
-; SDAG-NEXT: v_mov_b32_e32 v14, s26
-; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: v_mov_b32_e32 v44, s24
+; SDAG-NEXT: v_mov_b32_e32 v45, s25
+; SDAG-NEXT: v_mov_b32_e32 v46, s26
+; SDAG-NEXT: v_mov_b32_e32 v47, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -5112,61 +5072,45 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
-; GISEL-NEXT: v_accvgpr_write_b32 a31, s23
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
-; GISEL-NEXT: v_accvgpr_write_b32 a30, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a29, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a28, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a27, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a26, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a25, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a24, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a23, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a22, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a21, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a20, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a19, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a18, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a17, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a16, s8
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0)
@@ -5180,78 +5124,70 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: v_mov_b32_e32 v4, s16
-; SDAG-NEXT: v_mov_b32_e32 v5, s17
-; SDAG-NEXT: v_mov_b32_e32 v6, s18
-; SDAG-NEXT: v_mov_b32_e32 v7, s19
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: v_mov_b32_e32 v20, s16
+; SDAG-NEXT: v_mov_b32_e32 v21, s17
+; SDAG-NEXT: v_mov_b32_e32 v22, s18
+; SDAG-NEXT: v_mov_b32_e32 v23, s19
+; SDAG-NEXT: v_mov_b32_e32 v24, s20
+; SDAG-NEXT: v_mov_b32_e32 v25, s21
+; SDAG-NEXT: v_mov_b32_e32 v26, s22
+; SDAG-NEXT: v_mov_b32_e32 v27, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v13, s25
-; SDAG-NEXT: v_mov_b32_e32 v14, s26
-; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: v_mov_b32_e32 v28, s24
+; SDAG-NEXT: v_mov_b32_e32 v29, s25
+; SDAG-NEXT: v_mov_b32_e32 v30, s26
+; SDAG-NEXT: v_mov_b32_e32 v31, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -5259,61 +5195,53 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
@@ -6302,6 +6230,6 @@ declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6
declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2
-attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
attributes #1 = { "amdgpu-flat-work-group-size"="128,128" }
attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
index 7193fee..31a48de 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
@@ -1,22 +1,54 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -global-isel < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 -global-isel < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-GISEL %s
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float>, <2 x float>, <4 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float>, <2 x float>, <16 x float>, i32, i32, i32)
-; GCN-LABEL: {{^}}test_mfma_f32_16x16x8xf32:
-; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1.0
-; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2.0
-; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 0x40400000
-; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4.0
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX942: v_mfma_f32_16x16x8_xf32 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:[[TWO]]], v[[[THREE]]:[[FOUR]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_f32_16x16x8_xf32 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) #0 {
+; GFX942-SDAG-LABEL: test_mfma_f32_16x16x8xf32:
+; GFX942-SDAG: ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 2.0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0x40400000
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: v_mfma_f32_16x16x8_xf32 a[0:3], v[4:5], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: s_nop 6
+; GFX942-SDAG-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7]
+; GFX942-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_f32_16x16x8xf32:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-GISEL-NEXT: s_mov_b32 s4, 1.0
+; GFX942-GISEL-NEXT: s_mov_b32 s5, 2.0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT: s_mov_b32 s4, 0x40400000
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-GISEL-NEXT: s_mov_b32 s5, 4.0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_f32_16x16x8_xf32 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 5
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -24,17 +56,82 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x4xf32:
-; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1.0
-; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2.0
-; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 0x40400000
-; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4.0
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
-; GFX942: v_mfma_f32_32x32x4_xf32 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:[[TWO]]], v[[[THREE]]:[[FOUR]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GISEL: v_mfma_f32_32x32x4_xf32 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
-; GCN-NOT: v_accvgpr_read_b32
-; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 {
+; GFX942-SDAG-LABEL: test_mfma_f32_32x32x4xf32:
+; GFX942-SDAG: ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 1.0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 2.0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0x40400000
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: s_nop 7
+; GFX942-SDAG-NEXT: s_nop 1
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-SDAG-NEXT: s_endpgm
+;
+; GFX942-GISEL-LABEL: test_mfma_f32_32x32x4xf32:
+; GFX942-GISEL: ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-GISEL-NEXT: s_mov_b32 s18, 1.0
+; GFX942-GISEL-NEXT: s_mov_b32 s19, 2.0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[18:19]
+; GFX942-GISEL-NEXT: s_mov_b32 s18, 0x40400000
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-GISEL-NEXT: s_mov_b32 s19, 4.0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
+; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: s_nop 7
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <16 x float> %in.1, i32 1, i32 2, i32 3)
@@ -43,3 +140,5 @@ bb:
}
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX942: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 77d4aad..b25fe83 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -44,23 +44,23 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_mov_b32_e32 v16, s16
+; GISEL-NEXT: v_mov_b32_e32 v12, s16
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[6:7]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -120,25 +120,25 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, <
; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -624,25 +624,25 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0
; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__sgpr:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v8, s0
-; GCN-NEXT: v_mov_b32_e32 v9, s1
-; GCN-NEXT: v_mov_b32_e32 v10, s2
-; GCN-NEXT: v_mov_b32_e32 v11, s3
-; GCN-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NEXT: v_mov_b32_e32 v2, s18
-; GCN-NEXT: v_mov_b32_e32 v3, s19
-; GCN-NEXT: v_mov_b32_e32 v4, s20
-; GCN-NEXT: v_mov_b32_e32 v5, s21
-; GCN-NEXT: v_mov_b32_e32 v6, s22
-; GCN-NEXT: v_mov_b32_e32 v7, s23
+; GCN-NEXT: v_mov_b32_e32 v10, s0
+; GCN-NEXT: v_mov_b32_e32 v11, s1
+; GCN-NEXT: v_mov_b32_e32 v12, s2
+; GCN-NEXT: v_mov_b32_e32 v13, s3
+; GCN-NEXT: v_mov_b32_e32 v2, s16
+; GCN-NEXT: v_mov_b32_e32 v3, s17
+; GCN-NEXT: v_mov_b32_e32 v4, s18
+; GCN-NEXT: v_mov_b32_e32 v5, s19
+; GCN-NEXT: v_mov_b32_e32 v6, s20
+; GCN-NEXT: v_mov_b32_e32 v7, s21
+; GCN-NEXT: v_mov_b32_e32 v8, s22
+; GCN-NEXT: v_mov_b32_e32 v9, s23
; GCN-NEXT: v_accvgpr_write_b32 a0, s24
; GCN-NEXT: v_accvgpr_write_b32 a1, s25
; GCN-NEXT: v_accvgpr_write_b32 a2, s26
; GCN-NEXT: v_accvgpr_write_b32 a3, s27
-; GCN-NEXT: v_mov_b32_e32 v12, s28
+; GCN-NEXT: v_mov_b32_e32 v0, s28
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[8:11], v[0:7], v12
+; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[10:13], v[2:9], v0
; GCN-NEXT: s_nop 7
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
@@ -887,24 +887,24 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -964,25 +964,25 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x
; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -1429,24 +1429,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1506,25 +1506,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -1598,24 +1598,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1675,25 +1675,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -1767,24 +1767,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1844,25 +1844,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -1936,24 +1936,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v16, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2013,25 +2013,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s0
-; SDAG-NEXT: v_mov_b32_e32 v9, s1
-; SDAG-NEXT: v_mov_b32_e32 v10, s2
-; SDAG-NEXT: v_mov_b32_e32 v11, s3
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s20
-; SDAG-NEXT: v_mov_b32_e32 v5, s21
-; SDAG-NEXT: v_mov_b32_e32 v6, s22
-; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_mov_b32_e32 v10, s0
+; SDAG-NEXT: v_mov_b32_e32 v11, s1
+; SDAG-NEXT: v_mov_b32_e32 v12, s2
+; SDAG-NEXT: v_mov_b32_e32 v13, s3
+; SDAG-NEXT: v_mov_b32_e32 v2, s16
+; SDAG-NEXT: v_mov_b32_e32 v3, s17
+; SDAG-NEXT: v_mov_b32_e32 v4, s18
+; SDAG-NEXT: v_mov_b32_e32 v5, s19
+; SDAG-NEXT: v_mov_b32_e32 v6, s20
+; SDAG-NEXT: v_mov_b32_e32 v7, s21
+; SDAG-NEXT: v_mov_b32_e32 v8, s22
+; SDAG-NEXT: v_mov_b32_e32 v9, s23
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[8:11], v[0:7], v12
+; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[10:13], v[2:9], v0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
@@ -3552,4 +3552,4 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
ret <16 x float> %result
}
-attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0,0" }
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll
index 5b2de59..393581f 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll
@@ -121,18 +121,7 @@ define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1250-NEXT: v_max_num_f32_e32 v0, 0, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_min_num_f32_e32 v0, 1.0, v0
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.ext = fpext bfloat %src0 to float
%src1.ext = fpext bfloat %src1 to float
@@ -150,20 +139,9 @@ define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_fma_mixlo_bf16 v1, v0, v1, v2 op_sel_hi:[1,1,1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX1250-NEXT: v_max_num_f32_e32 v0, 0, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_min_num_f32_e32 v0, 1.0, v0
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_store_b16 v[0:1], v1, off scope:SCOPE_SYS
+; GFX1250-NEXT: v_fma_mixlo_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT: global_store_b16 v[0:1], v3, off scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.ext = fpext bfloat %src0 to float
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll
index 557080a..1b2eb83 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll
@@ -75,15 +75,8 @@ define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_post_cvt(bfloat %src0, b
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1250-NEXT: v_max_num_f32_e32 v0, 0, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_min_num_f32_e32 v0, 1.0, v0
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.ext = fpext bfloat %src0 to float
%src1.ext = fpext bfloat %src1 to float
@@ -199,9 +192,8 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt(<2 x bfloat> %src0, <2 x bflo
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, 0
-; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
%src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
@@ -219,16 +211,13 @@ define <3 x bfloat> @v_mad_mix_v3f32_clamp_postcvt(<3 x bfloat> %src0, <3 x bflo
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_fma_mixlo_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_fma_mixhi_bf16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v1, v3, v5 op_sel_hi:[1,1,1]
-; GFX1250-NEXT: v_pk_max_num_bf16 v1, v6, 0
+; GFX1250-NEXT: v_fma_mixlo_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT: v_fma_mixlo_bf16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_pk_max_num_bf16 v2, v0, 0
-; GFX1250-NEXT: v_pk_min_num_bf16 v0, v1, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT: v_fma_mixhi_bf16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1250-NEXT: v_pk_min_num_bf16 v1, v2, 1.0
+; GFX1250-NEXT: v_mov_b32_e32 v0, v6
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.ext = fpext <3 x bfloat> %src0 to <3 x float>
%src1.ext = fpext <3 x bfloat> %src1 to <3 x float>
@@ -261,11 +250,8 @@ define <4 x bfloat> @v_mad_mix_v4f32_clamp_postcvt(<4 x bfloat> %src0, <4 x bflo
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, 0
-; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, 0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, 1.0 op_sel_hi:[1,0]
-; GFX1250-NEXT: v_pk_min_num_bf16 v1, v1, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.ext = fpext <4 x bfloat> %src0 to <4 x float>
%src1.ext = fpext <4 x bfloat> %src1 to <4 x float>
@@ -291,15 +277,7 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x bfloat> %src0, <2 x b
; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX1250-NEXT: v_max_num_f32_e32 v1, 0, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
-; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_min_num_f32_e32 v1, 1.0, v1
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_max_num_bf16 v1, v0, v0 clamp
; GFX1250-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
@@ -328,14 +306,8 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x bfloat> %src0, <2 x b
; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
-; GFX1250-NEXT: v_max_num_f32_e32 v1, 0, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
-; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_min_num_f32_e32 v1, 1.0, v1
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
index 9cc42ac..be02045 100644
--- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
@@ -9851,8 +9851,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s6
; CHECK-NEXT: s_cbranch_execz .LBB8_6
; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader
-; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
; CHECK-NEXT: v_add_nc_u32_e32 v1, 0x700, v1
+; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
; CHECK-NEXT: s_movk_i32 s4, 0xf800
; CHECK-NEXT: s_mov_b32 s5, -1
; CHECK-NEXT: .LBB8_5: ; %memmove_bwd_loop
@@ -11167,8 +11167,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_andn2_saveexec_b32 s6, s6
; ALIGNED-NEXT: s_cbranch_execz .LBB8_6
; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader
-; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0x700, v1
+; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
; ALIGNED-NEXT: s_movk_i32 s4, 0xf800
; ALIGNED-NEXT: s_mov_b32 s5, -1
; ALIGNED-NEXT: .LBB8_5: ; %memmove_bwd_loop
@@ -12381,8 +12381,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2024
; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:2020
; UNROLL3-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:2016
-; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 0x7b0, v0
; UNROLL3-NEXT: v_add_nc_u32_e32 v1, 0x7b0, v1
+; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 0x7b0, v0
; UNROLL3-NEXT: s_waitcnt vmcnt(3)
; UNROLL3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2028
; UNROLL3-NEXT: s_waitcnt vmcnt(2)
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index bc25084..5e5e3bf 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -415,11 +415,6 @@ define amdgpu_kernel void @local_volatile_store_0(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
-; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
-; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: ds_store_b32 v0, v1
; GFX12-WGP-NEXT: s_endpgm
;
@@ -432,11 +427,6 @@ define amdgpu_kernel void @local_volatile_store_0(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
-; GFX12-CU-NEXT: s_wait_samplecnt 0x0
-; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
-; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(3) %out) {
@@ -562,11 +552,6 @@ define amdgpu_kernel void @local_volatile_store_1(
; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
-; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
-; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: ds_store_b32 v0, v1
; GFX12-WGP-NEXT: s_endpgm
;
@@ -583,11 +568,6 @@ define amdgpu_kernel void @local_volatile_store_1(
; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
-; GFX12-CU-NEXT: s_wait_samplecnt 0x0
-; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
-; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(3) %out) {
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
index 6763957..f7aaa3e 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
@@ -1,15 +1,148 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GCN,GFX908 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX90A %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX90A %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_vgpr:
-; GFX908: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
-; GFX90A: v_mfma_f32_32x32x1{{.*}} v[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vgpr(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_vgpr:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX908-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v0, s16
+; GFX908-NEXT: v_mov_b32_e32 v1, s17
+; GFX908-NEXT: v_mov_b32_e32 v2, s18
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s21
+; GFX908-NEXT: v_mov_b32_e32 v1, s22
+; GFX908-NEXT: v_mov_b32_e32 v2, s23
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s24
+; GFX908-NEXT: v_mov_b32_e32 v1, s25
+; GFX908-NEXT: v_mov_b32_e32 v2, s26
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s27
+; GFX908-NEXT: v_mov_b32_e32 v1, s28
+; GFX908-NEXT: v_mov_b32_e32 v2, s29
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s30
+; GFX908-NEXT: v_mov_b32_e32 v1, s31
+; GFX908-NEXT: v_mov_b32_e32 v2, s0
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s1
+; GFX908-NEXT: v_mov_b32_e32 v1, s2
+; GFX908-NEXT: v_mov_b32_e32 v2, s3
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s4
+; GFX908-NEXT: v_mov_b32_e32 v1, s5
+; GFX908-NEXT: v_mov_b32_e32 v2, s6
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s7
+; GFX908-NEXT: v_mov_b32_e32 v1, s8
+; GFX908-NEXT: v_mov_b32_e32 v2, s9
+; GFX908-NEXT: v_mov_b32_e32 v3, s19
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s10
+; GFX908-NEXT: v_mov_b32_e32 v1, s11
+; GFX908-NEXT: v_mov_b32_e32 v2, s12
+; GFX908-NEXT: v_mov_b32_e32 v5, s20
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s13
+; GFX908-NEXT: v_mov_b32_e32 v1, s14
+; GFX908-NEXT: v_mov_b32_e32 v2, s15
+; GFX908-NEXT: v_mov_b32_e32 v3, 1.0
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
+; GFX908-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -17,9 +150,142 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_agpr:
-; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg) #2 {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_agpr:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX908-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v0, s16
+; GFX908-NEXT: v_mov_b32_e32 v1, s17
+; GFX908-NEXT: v_mov_b32_e32 v2, s18
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s21
+; GFX908-NEXT: v_mov_b32_e32 v1, s22
+; GFX908-NEXT: v_mov_b32_e32 v2, s23
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s24
+; GFX908-NEXT: v_mov_b32_e32 v1, s25
+; GFX908-NEXT: v_mov_b32_e32 v2, s26
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s27
+; GFX908-NEXT: v_mov_b32_e32 v1, s28
+; GFX908-NEXT: v_mov_b32_e32 v2, s29
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s30
+; GFX908-NEXT: v_mov_b32_e32 v1, s31
+; GFX908-NEXT: v_mov_b32_e32 v2, s0
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s1
+; GFX908-NEXT: v_mov_b32_e32 v1, s2
+; GFX908-NEXT: v_mov_b32_e32 v2, s3
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s4
+; GFX908-NEXT: v_mov_b32_e32 v1, s5
+; GFX908-NEXT: v_mov_b32_e32 v2, s6
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s7
+; GFX908-NEXT: v_mov_b32_e32 v1, s8
+; GFX908-NEXT: v_mov_b32_e32 v2, s9
+; GFX908-NEXT: v_mov_b32_e32 v3, s19
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s10
+; GFX908-NEXT: v_mov_b32_e32 v1, s11
+; GFX908-NEXT: v_mov_b32_e32 v2, s12
+; GFX908-NEXT: v_mov_b32_e32 v5, s20
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s13
+; GFX908-NEXT: v_mov_b32_e32 v1, s14
+; GFX908-NEXT: v_mov_b32_e32 v2, s15
+; GFX908-NEXT: v_mov_b32_e32 v3, 1.0
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
+; GFX908-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -27,9 +293,105 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr:
-; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr addrspace(1) %arg) {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; def a0
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1]
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
+; GFX908-NEXT: s_endpgm
bb:
%acc = call i32 asm sideeffect "; def $0", "={a0}"()
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -38,9 +400,105 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_inline_asm_phys_agpr:
-; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr addrspace(1) %arg) {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_inline_asm_phys_agpr:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; use a[100:131]
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1]
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
+; GFX908-NEXT: s_endpgm
bb:
call void asm sideeffect "; use $0", "{a[100:131]}"(<32 x float> poison)
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -49,10 +507,105 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_inline_asm_no_agprs:
-; GFX908: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
-; GFX90A: v_mfma_f32_32x32x1{{.*}} v[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_inline_asm_no_agprs:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; def v0
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1]
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
+; GFX908-NEXT: s_endpgm
bb:
%acc = call i32 asm sideeffect "; def $0", "={v0}"()
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -61,9 +614,127 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_call:
-; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) #1 {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_call:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX908-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX908-NEXT: s_mov_b32 s38, -1
+; GFX908-NEXT: s_mov_b32 s39, 0xe00000
+; GFX908-NEXT: s_add_u32 s36, s36, s11
+; GFX908-NEXT: s_addc_u32 s37, s37, 0
+; GFX908-NEXT: s_mov_b32 s12, s8
+; GFX908-NEXT: s_add_u32 s8, s4, 44
+; GFX908-NEXT: s_mov_b32 s13, s9
+; GFX908-NEXT: s_addc_u32 s9, s5, 0
+; GFX908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX908-NEXT: s_getpc_b64 s[4:5]
+; GFX908-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
+; GFX908-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
+; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX908-NEXT: s_mov_b32 s14, s10
+; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX908-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX908-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX908-NEXT: s_mov_b32 s32, 0
+; GFX908-NEXT: v_mov_b32_e32 v40, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX908-NEXT: global_load_dwordx4 v[28:31], v40, s[34:35] offset:112
+; GFX908-NEXT: global_load_dwordx4 v[24:27], v40, s[34:35] offset:96
+; GFX908-NEXT: global_load_dwordx4 v[20:23], v40, s[34:35] offset:80
+; GFX908-NEXT: global_load_dwordx4 v[16:19], v40, s[34:35] offset:64
+; GFX908-NEXT: global_load_dwordx4 v[12:15], v40, s[34:35] offset:48
+; GFX908-NEXT: global_load_dwordx4 v[8:11], v40, s[34:35] offset:32
+; GFX908-NEXT: global_load_dwordx4 v[4:7], v40, s[34:35] offset:16
+; GFX908-NEXT: global_load_dwordx4 v[0:3], v40, s[34:35]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
+; GFX908-NEXT: global_store_dwordx4 v40, v[0:3], s[34:35] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: global_store_dwordx4 v40, v[4:7], s[34:35] offset:112
+; GFX908-NEXT: global_store_dwordx4 v40, v[8:11], s[34:35] offset:64
+; GFX908-NEXT: global_store_dwordx4 v40, v[12:15], s[34:35] offset:80
+; GFX908-NEXT: global_store_dwordx4 v40, v[16:19], s[34:35] offset:32
+; GFX908-NEXT: global_store_dwordx4 v40, v[20:23], s[34:35] offset:48
+; GFX908-NEXT: global_store_dwordx4 v40, v[24:27], s[34:35]
+; GFX908-NEXT: global_store_dwordx4 v40, v[0:3], s[34:35] offset:16
+; GFX908-NEXT: s_endpgm
bb:
call void @foo()
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -75,10 +746,173 @@ bb:
; We could avoid scanning to find calls since we see them during lowering, before selection.
; However, in SDag, lowering and selection are done block by block, so this would only work
; with GlobalISel.
-
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_call_multi_bb:
-; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace(1) %arg, i1 %c0) #1 {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_call_multi_bb:
+; GFX908: ; %bb.0: ; %bb1
+; GFX908-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0
+; GFX908-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1
+; GFX908-NEXT: s_mov_b32 s54, -1
+; GFX908-NEXT: s_mov_b32 s55, 0xe00000
+; GFX908-NEXT: s_add_u32 s52, s52, s11
+; GFX908-NEXT: s_mov_b32 s14, s10
+; GFX908-NEXT: s_mov_b32 s12, s8
+; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX908-NEXT: s_load_dword s8, s[4:5], 0x2c
+; GFX908-NEXT: v_mov_b32_e32 v6, 1.0
+; GFX908-NEXT: v_mov_b32_e32 v7, 0
+; GFX908-NEXT: s_addc_u32 s53, s53, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0
+; GFX908-NEXT: s_load_dwordx16 s[16:31], s[6:7], 0x40
+; GFX908-NEXT: s_bitcmp0_b32 s8, 0
+; GFX908-NEXT: s_mov_b32 s32, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v3, s36
+; GFX908-NEXT: v_mov_b32_e32 v4, s37
+; GFX908-NEXT: v_mov_b32_e32 v5, s40
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v4
+; GFX908-NEXT: v_mov_b32_e32 v3, s38
+; GFX908-NEXT: v_mov_b32_e32 v4, s39
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v4
+; GFX908-NEXT: v_mov_b32_e32 v3, s41
+; GFX908-NEXT: v_mov_b32_e32 v4, s42
+; GFX908-NEXT: v_mov_b32_e32 v5, s43
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, s44
+; GFX908-NEXT: v_mov_b32_e32 v4, s45
+; GFX908-NEXT: v_mov_b32_e32 v5, s46
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, s47
+; GFX908-NEXT: v_mov_b32_e32 v4, s48
+; GFX908-NEXT: v_mov_b32_e32 v5, s49
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, s50
+; GFX908-NEXT: v_mov_b32_e32 v4, s51
+; GFX908-NEXT: v_mov_b32_e32 v5, s16
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, s17
+; GFX908-NEXT: v_mov_b32_e32 v4, s18
+; GFX908-NEXT: v_mov_b32_e32 v5, s19
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
+; GFX908-NEXT: v_mov_b32_e32 v4, s21
+; GFX908-NEXT: v_mov_b32_e32 v5, s22
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, s23
+; GFX908-NEXT: v_mov_b32_e32 v4, s24
+; GFX908-NEXT: v_mov_b32_e32 v5, s25
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, s26
+; GFX908-NEXT: v_mov_b32_e32 v4, s27
+; GFX908-NEXT: v_mov_b32_e32 v5, s28
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, s29
+; GFX908-NEXT: v_mov_b32_e32 v4, s30
+; GFX908-NEXT: v_mov_b32_e32 v5, s31
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v6, v3, a[0:31] cbsz:1 abid:2 blgp:3
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a24
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a28
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:112
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a16
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:64
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a20
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:80
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a8
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:32
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a12
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:48
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7]
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a4
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:16
+; GFX908-NEXT: s_cbranch_scc1 .LBB6_2
+; GFX908-NEXT: ; %bb.1: ; %bb2
+; GFX908-NEXT: s_add_u32 s8, s4, 48
+; GFX908-NEXT: s_mov_b32 s13, s9
+; GFX908-NEXT: s_addc_u32 s9, s5, 0
+; GFX908-NEXT: s_getpc_b64 s[4:5]
+; GFX908-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
+; GFX908-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
+; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX908-NEXT: s_mov_b64 s[0:1], s[52:53]
+; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX908-NEXT: s_mov_b64 s[2:3], s[54:55]
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX908-NEXT: .LBB6_2: ; %bb3
+; GFX908-NEXT: s_endpgm
bb1:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
@@ -94,10 +928,101 @@ bb3:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_nonentry_noagpr:
-; GFX908: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
-; GFX90A: v_mfma_f32_32x32x1{{.*}} v[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9:]+}}]
define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_nonentry_noagpr:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112
+; GFX908-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96
+; GFX908-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80
+; GFX908-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64
+; GFX908-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
+; GFX908-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
+; GFX908-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
+; GFX908-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v6
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v7
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v8
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v9
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v10
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v11
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v12
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v13
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v14
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v15
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v16
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v17
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v18
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v19
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v20
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v21
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v22
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v23
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v24
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v25
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v26
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v27
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v28
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v29
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v30
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v31
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v32
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v33
+; GFX908-NEXT: v_mov_b32_e32 v2, 1.0
+; GFX908-NEXT: v_mov_b32_e32 v3, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a0
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a4
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:112
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:64
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:80
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:32
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:48
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[26:29], off
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -105,9 +1030,101 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_nonentry_with_agpr:
-; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg) #3 {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_nonentry_with_agpr:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112
+; GFX908-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96
+; GFX908-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80
+; GFX908-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64
+; GFX908-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
+; GFX908-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
+; GFX908-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
+; GFX908-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v6
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v7
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v8
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v9
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v10
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v11
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v12
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v13
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v14
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v15
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v16
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v17
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v18
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v19
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v20
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v21
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v22
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v23
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v24
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v25
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v26
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v27
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v28
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v29
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v30
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v31
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v32
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v33
+; GFX908-NEXT: v_mov_b32_e32 v2, 1.0
+; GFX908-NEXT: v_mov_b32_e32 v3, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a0
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a4
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:112
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:64
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:80
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:32
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:48
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[26:29], off
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -121,3 +1138,6 @@ attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2
attributes #1 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" }
attributes #2 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0" }
attributes #3 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
+; GFX90A: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 9c38d7f..5b0d2d2 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -3565,13 +3565,13 @@ define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x
; PACKED-SDAG-LABEL: fneg_v2f32_scalar:
; PACKED-SDAG: ; %bb.0:
; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, 0
; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; PACKED-SDAG-NEXT: s_xor_b32 s3, s3, 0x80000000
; PACKED-SDAG-NEXT: s_xor_b32 s2, s2, 0x80000000
-; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; PACKED-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; PACKED-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; PACKED-SDAG-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; PACKED-SDAG-NEXT: s_endpgm
;
; PACKED-GISEL-LABEL: fneg_v2f32_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
index a5c8f04..f54a383 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=greedy,1 < %s | FileCheck -check-prefix=REGALLOC-GFX908 %s
+;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=greedy,2 < %s | FileCheck -check-prefix=REGALLOC-GFX908 %s
;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=prologepilog < %s | FileCheck -check-prefix=PEI-GFX908 %s
-;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=greedy,1 < %s | FileCheck -check-prefix=REGALLOC-GFX90A %s
+;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=greedy,2 < %s | FileCheck -check-prefix=REGALLOC-GFX90A %s
;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=prologepilog < %s | FileCheck -check-prefix=PEI-GFX90A %s
; Partial reg copy and spill missed during regalloc are handled later at frame lowering.
@@ -12,17 +12,21 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5
; REGALLOC-GFX908-NEXT: {{ $}}
; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32
- ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def %7
- ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %8
- ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64, %7, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
+ ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def %25
+ ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %25
+ ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %27
+ ; REGALLOC-GFX908-NEXT: SI_SPILL_AV64_SAVE %27, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+ ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]]
+ ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; REGALLOC-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
- ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
+ ; REGALLOC-GFX908-NEXT: [[COPY2:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
; REGALLOC-GFX908-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec
; REGALLOC-GFX908-NEXT: [[AV_MOV_1:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec
- ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
- ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64, %8, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
- ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
- ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
+ ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; REGALLOC-GFX908-NEXT: [[SI_SPILL_AV64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
+ ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64, [[SI_SPILL_AV64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+ ; REGALLOC-GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
+ ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; REGALLOC-GFX908-NEXT: S_ENDPGM 0
;
; PEI-GFX908-LABEL: name: partial_copy
@@ -57,40 +61,35 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5
; REGALLOC-GFX90A-NEXT: {{ $}}
; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32
- ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def %7
- ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %8
- ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, %7, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
+ ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def %24
+ ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %24
+ ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %22
+ ; REGALLOC-GFX90A-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY %22
+ ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
- ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
+ ; REGALLOC-GFX90A-NEXT: [[COPY2:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
; REGALLOC-GFX90A-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec
; REGALLOC-GFX90A-NEXT: [[AV_MOV_1:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec
- ; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
- ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64_align2, %8, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+ ; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64_align2, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; REGALLOC-GFX90A-NEXT: S_ENDPGM 0
;
; PEI-GFX90A-LABEL: name: partial_copy
; PEI-GFX90A: bb.0 (%ir-block.0):
- ; PEI-GFX90A-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9
+ ; PEI-GFX90A-NEXT: liveins: $sgpr4_sgpr5
; PEI-GFX90A-NEXT: {{ $}}
- ; PEI-GFX90A-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
- ; PEI-GFX90A-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
- ; PEI-GFX90A-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
- ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1
- ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
- ; PEI-GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
+ ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr2_vgpr3
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec
; PEI-GFX90A-NEXT: renamable $vgpr0 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec
; PEI-GFX90A-NEXT: renamable $vgpr1 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
- ; PEI-GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
- ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
+ ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX90A-NEXT: S_ENDPGM 0
call void asm sideeffect "; use $0", "a" (i32 poison)
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
index d48bfe0..68ef30a9 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -53,31 +53,31 @@ define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) {
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s20
-; GFX942-NEXT: v_mov_b32_e32 v1, s21
-; GFX942-NEXT: v_mov_b32_e32 v2, s22
-; GFX942-NEXT: v_mov_b32_e32 v3, s23
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; GFX942-NEXT: v_mov_b32_e32 v2, s20
+; GFX942-NEXT: v_mov_b32_e32 v3, s21
+; GFX942-NEXT: v_mov_b32_e32 v4, s22
+; GFX942-NEXT: v_mov_b32_e32 v5, s23
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v0, s16
-; GFX942-NEXT: v_mov_b32_e32 v1, s17
-; GFX942-NEXT: v_mov_b32_e32 v2, s18
-; GFX942-NEXT: v_mov_b32_e32 v3, s19
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX942-NEXT: v_mov_b32_e32 v2, s16
+; GFX942-NEXT: v_mov_b32_e32 v3, s17
+; GFX942-NEXT: v_mov_b32_e32 v4, s18
+; GFX942-NEXT: v_mov_b32_e32 v5, s19
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v0, s12
-; GFX942-NEXT: v_mov_b32_e32 v1, s13
-; GFX942-NEXT: v_mov_b32_e32 v2, s14
-; GFX942-NEXT: v_mov_b32_e32 v3, s15
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX942-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-NEXT: v_mov_b32_e32 v3, s13
+; GFX942-NEXT: v_mov_b32_e32 v4, s14
+; GFX942-NEXT: v_mov_b32_e32 v5, s15
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mov_b32_e32 v0, s8
-; GFX942-NEXT: v_mov_b32_e32 v1, s9
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
-; GFX942-NEXT: v_mov_b32_e32 v3, s11
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v2, s8
+; GFX942-NEXT: v_mov_b32_e32 v3, s9
+; GFX942-NEXT: v_mov_b32_e32 v4, s10
+; GFX942-NEXT: v_mov_b32_e32 v5, s11
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
store <16 x i32> %a, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/read-write-register-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/read-write-register-illegal-type.ll
new file mode 100644
index 0000000..2324f3f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/read-write-register-illegal-type.ll
@@ -0,0 +1,29 @@
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
+
+; CHECK: error: <unknown>:0:0: cannot use llvm.read_register with illegal type
+define amdgpu_kernel void @test_read_register_i9(ptr addrspace(1) %out) nounwind {
+ %reg = call i9 @llvm.read_register.i9(metadata !0)
+ store i9 %reg, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: error: <unknown>:0:0: cannot use llvm.write_register with illegal type
+define amdgpu_kernel void @test_write_register_i9(ptr addrspace(1) %out) nounwind {
+ call void @llvm.write_register.i9(metadata !0, i9 42)
+ ret void
+}
+
+; CHECK: error: <unknown>:0:0: cannot use llvm.read_register with illegal type
+define amdgpu_kernel void @test_read_register_i128(ptr addrspace(1) %out) nounwind {
+ %reg = call i128 @llvm.read_register.i128(metadata !0)
+ store i128 %reg, ptr addrspace(1) %out
+ ret void
+}
+
+; CHECK: error: <unknown>:0:0: cannot use llvm.write_register with illegal type
+define amdgpu_kernel void @test_write_register_i128(ptr addrspace(1) %out) nounwind {
+ call void @llvm.write_register.i128(metadata !0, i128 42)
+ ret void
+}
+
+!0 = !{!"m0"}
diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll b/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll
index f2fd3a8..c035e9f 100644
--- a/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll
+++ b/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll
@@ -9,9 +9,9 @@
%asm.output = type { <16 x i32>, <8 x i32>, <5 x i32>, <4 x i32>, <16 x i32> }
; CHECK-LABEL: {{^}}illegal_eviction_assert:
-; CHECK: ; def v[4:19] v[20:27] v[0:4] v[0:3] a[0:15]
+; CHECK: ; def v[13:28] v[0:7] v[8:12] v[0:3] a[0:15]
; CHECK: ; clobber
-; CHECK: ; use v[4:19] v[20:27] v[0:4] v[0:3] a[1:16]
+; CHECK: ; use v[13:28] v[0:7] v[8:12] v[0:3] a[1:16]
define void @illegal_eviction_assert(ptr addrspace(1) %arg) #0 {
;%agpr0 = call i32 asm sideeffect "; def $0","=${a0}"()
%asm = call %asm.output asm sideeffect "; def $0 $1 $2 $3 $4","=v,=v,=v,=v,={a[0:15]}"()
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
index e8e122e..bbb9df9 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
@@ -59,19 +59,19 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX90A-LABEL: scalar_to_vector_v8i16:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4
+; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, s3
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX90A-NEXT: v_mov_b32_e32 v3, s1
+; GFX90A-NEXT: v_mov_b32_e32 v4, s0
+; GFX90A-NEXT: v_mov_b32_e32 v5, s0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GFX90A-NEXT: s_endpgm
entry:
%val.1.i32 = extractelement <2 x i32> %in, i64 0
@@ -146,19 +146,19 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
; GFX90A-LABEL: scalar_to_vector_v8f16:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4
+; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, s3
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
-; GFX90A-NEXT: v_mov_b32_e32 v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s0
+; GFX90A-NEXT: v_mov_b32_e32 v1, s3
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX90A-NEXT: v_mov_b32_e32 v3, s1
+; GFX90A-NEXT: v_mov_b32_e32 v5, s0
+; GFX90A-NEXT: v_mov_b32_e32 v4, s0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GFX90A-NEXT: s_endpgm
entry:
%val.1.float = extractelement <2 x float> %in, i64 0
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
index 9361187..4d864ad 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll
@@ -25,27 +25,27 @@ define void @shufflevector_v2i32_10_physreg_even_vgpr_pair_copy(ptr addrspace(1)
; GFX90A-LABEL: shufflevector_v2i32_10_physreg_even_vgpr_pair_copy:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shufflevector_v2i32_10_physreg_even_vgpr_pair_copy:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%asm = call { i32, i32 } asm "; def $0, $1", "={v4},={v5}"()
@@ -214,27 +214,27 @@ define void @shufflevector_v2i32_11_physreg_even_vgpr_pair_copy(ptr addrspace(1)
; GFX90A-LABEL: shufflevector_v2i32_11_physreg_even_vgpr_pair_copy:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v5
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shufflevector_v2i32_11_physreg_even_vgpr_pair_copy:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v5
+; GFX940-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%asm = call { i32, i32 } asm "; def $0, $1", "={v4},={v5}"()
@@ -265,31 +265,31 @@ define void @shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy(ptr addrspace(
; GFX90A-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5, v6, v7
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v10, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
+; GFX90A-NEXT: v_mov_b32_e32 v8, v7
+; GFX90A-NEXT: v_mov_b32_e32 v11, v4
+; GFX90A-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5, v6, v7
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v10, v5
+; GFX940-NEXT: v_mov_b32_e32 v9, v6
+; GFX940-NEXT: v_mov_b32_e32 v8, v7
+; GFX940-NEXT: v_mov_b32_e32 v11, v4
+; GFX940-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%asm = call { i32, i32, i32, i32 } asm "; def $0, $1, $2, $3", "={v4},={v5},={v6},={v7}"()
@@ -327,31 +327,31 @@ define void @shufflevector_v4i32_1032_physreg_even_vgpr_quad_copy(ptr addrspace(
; GFX90A-LABEL: shufflevector_v4i32_1032_physreg_even_vgpr_quad_copy:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5, v6, v7
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
-; GFX90A-NEXT: v_mov_b32_e32 v3, v6
-; GFX90A-NEXT: v_mov_b32_e32 v2, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v8, v5
+; GFX90A-NEXT: v_mov_b32_e32 v11, v6
+; GFX90A-NEXT: v_mov_b32_e32 v10, v7
+; GFX90A-NEXT: v_mov_b32_e32 v9, v4
+; GFX90A-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shufflevector_v4i32_1032_physreg_even_vgpr_quad_copy:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5, v6, v7
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
-; GFX940-NEXT: v_mov_b32_e32 v3, v6
-; GFX940-NEXT: v_mov_b32_e32 v2, v7
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v8, v5
+; GFX940-NEXT: v_mov_b32_e32 v11, v6
+; GFX940-NEXT: v_mov_b32_e32 v10, v7
+; GFX940-NEXT: v_mov_b32_e32 v9, v4
+; GFX940-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%asm = call { i32, i32, i32, i32 } asm "; def $0, $1, $2, $3", "={v4},={v5},={v6},={v7}"()
@@ -746,16 +746,15 @@ define i32 @shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt(p
; GFX90A-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v4, v5, v6, v7
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: v_mov_b32_e32 v1, v6
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
-; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, v5
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
+; GFX90A-NEXT: v_mov_b32_e32 v8, v7
+; GFX90A-NEXT: v_mov_b32_e32 v11, v4
+; GFX90A-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -763,17 +762,16 @@ define i32 @shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt(p
; GFX940-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v4, v5, v6, v7
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v5
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
-; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
-; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_mov_b32_e32 v10, v5
+; GFX940-NEXT: v_mov_b32_e32 v9, v6
+; GFX940-NEXT: v_mov_b32_e32 v8, v7
+; GFX940-NEXT: v_mov_b32_e32 v11, v4
+; GFX940-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v0, v6
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
index 5484f77..eb0d546 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
@@ -1,15 +1,107 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefixes=GCN,GFX90A %s
-; GCN-LABEL: {{^}}max_12regs_13a_used:
-; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
-; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
-; GCN: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}}
-; GCN-NOT: buffer_store_dword
-; GCN-NOT: buffer_load_dword
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
-; GCN: ScratchSize: 0
define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, ptr addrspace(1) %arg, ptr addrspace(1) %out) #2 {
+; GFX908-LABEL: max_12regs_13a_used:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX908-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_cmp_lg_u32 s0, 0
+; GFX908-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v5, s8
+; GFX908-NEXT: v_mov_b32_e32 v1, s9
+; GFX908-NEXT: v_mov_b32_e32 v2, s10
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v5
+; GFX908-NEXT: v_mov_b32_e32 v5, s11
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v0, a[0:3]
+; GFX908-NEXT: v_mfma_f32_4x4x1f32 a[4:7], v0, v0, a[0:3]
+; GFX908-NEXT: s_cbranch_scc0 .LBB0_2
+; GFX908-NEXT: ; %bb.1: ; %st
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_endpgm
+; GFX908-NEXT: .LBB0_2: ; %use
+; GFX908-NEXT: s_nop 2
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a6
+; GFX908-NEXT: v_accvgpr_write_b32 a4, 4
+; GFX908-NEXT: v_accvgpr_write_b32 a8, 5
+; GFX908-NEXT: v_accvgpr_write_b32 a9, 1
+; GFX908-NEXT: v_accvgpr_write_b32 a10, 2
+; GFX908-NEXT: v_accvgpr_write_b32 a11, 3
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v1
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v3, v0
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: max_12regs_13a_used:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v0, a[0:3]
+; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[4:7], v0, v0, a[0:3]
+; GFX90A-NEXT: s_cbranch_scc0 .LBB0_2
+; GFX90A-NEXT: ; %bb.1: ; %st
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_endpgm
+; GFX90A-NEXT: .LBB0_2: ; %use
+; GFX90A-NEXT: s_nop 3
+; GFX90A-NEXT: v_accvgpr_read_b32 v9, a7
+; GFX90A-NEXT: v_accvgpr_read_b32 v8, a6
+; GFX90A-NEXT: v_accvgpr_read_b32 v7, a5
+; GFX90A-NEXT: v_accvgpr_read_b32 v6, a4
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, 4
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, 5
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, 1
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, 2
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, 3
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v7
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v8
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v9
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
@@ -28,16 +120,64 @@ st:
call void asm sideeffect "", "a,a"(<4 x float> %mai.1, <4 x float> %mai.2)
ret void
}
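; With the 12-VGPR budget from attribute #2 and 13 AGPR values live, the
; expected spilling strategy is v_accvgpr_read/write copies through free
; VGPRs rather than scratch memory, which is why ScratchSize stays 0 below.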
-
-; GCN-LABEL: {{^}}max_10_vgprs_used_9a:
-; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
-; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
-; GCN: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}}
-; GCN-NOT: buffer_store_dword
-; GCN-NOT: buffer_load_dword
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
; GCN: ScratchSize: 0
+
define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
+; GFX908-LABEL: max_10_vgprs_used_9a:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v1
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v5
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: max_10_vgprs_used_9a:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a3
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a2
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v5
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_endpgm
%a1 = call <4 x i32> asm sideeffect "", "=a"()
%a2 = call <4 x i32> asm sideeffect "", "=a"()
%a3 = call i32 asm sideeffect "", "=a"()
@@ -46,17 +186,168 @@ define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
call void asm sideeffect "", "a"(<2 x i32> %a4)
ret void
}
-
-; GCN-LABEL: {{^}}max_32regs_mfma32:
-; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
-; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
-; GCN-NOT: buffer_store_dword
-; GCN: v_accvgpr_read_b32
-; GCN: v_mfma_f32_32x32x1f32
-; GCN-NOT: buffer_load_dword
-; GCN: v_accvgpr_write_b32
; GCN: ScratchSize: 0
+
define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 {
+; GFX908-LABEL: max_32regs_mfma32:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x40400000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x40c00000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x40e00000
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x40a00000
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v2
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x41000000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x41100000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x41200000
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x41300000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x41400000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x41500000
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x41600000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x41700000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x41800000
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x41880000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x41900000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x41980000
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x41a00000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x41a80000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x41b00000
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x41b80000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x41c00000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x41c80000
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x41d00000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x41d80000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x41e00000
+; GFX908-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x41e80000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x41f00000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x41f80000
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a0, 1.0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, 2.0
+; GFX908-NEXT: v_accvgpr_write_b32 a3, 4.0
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a31, 2.0
+; GFX908-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v1, a[0:31]
+; GFX908-NEXT: v_mov_b32_e32 v0, 0
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 5
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v5
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: max_32regs_mfma32:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x40400000
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x40a00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x40c00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x40e00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41000000
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41100000
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41200000
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41300000
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41400000
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41500000
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41600000
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41700000
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41800000
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41880000
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41900000
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41980000
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41a00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41a80000
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41b00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41b80000
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41c00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41c80000
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41d00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41d80000
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41e00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41e80000
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41f00000
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, 2.0
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41f80000
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, 4.0
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2
+; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a1
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v1, a[0:31]
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: global_store_dword v0, a0, s[2:3]
+; GFX90A-NEXT: s_endpgm
bb:
%v = call i32 asm sideeffect "", "=a"()
br label %use
@@ -68,42 +359,110 @@ use:
store float %elt1, ptr addrspace(1) %arg
ret void
}
+; GCN: ScratchSize: 0
; Should spill AGPRs to memory for both gfx908 and gfx90a.
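; With only 6 VGPRs available (attribute #4) there is no spare VGPR range to
; hold the 8 AGPR values, so the spills and reloads below have to go through
; the scratch buffer; the ScratchSize: 20 check at the end confirms this.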
-; GCN-LABEL: {{^}}max_6regs_used_8a:
-; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
-; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
-
-; GFX908-DAG: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
-; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Spill
-; GFX908-DAG: v_accvgpr_read_b32 v5, a1 ; Reload Reuse
-; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
-; GFX908-DAG: v_accvgpr_read_b32 v5, a2 ; Reload Reuse
-; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
-; GFX908-DAG: v_accvgpr_read_b32 v5, a3 ; Reload Reuse
-; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill
-
-; GFX90A-DAG: buffer_store_dword a0, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Spill
-; GFX90A-DAG: buffer_store_dword a1, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
-; GFX90A-DAG: buffer_store_dword a2, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
-; GFX90A-DAG: buffer_store_dword a3, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill
-
-; GCN: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]
-
-; GFX908-DAG: buffer_load_dword v0, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Reload
-; GFX908-DAG: buffer_load_dword v1, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload
-; GFX908-DAG: buffer_load_dword v2, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload
-; GFX908-DAG: buffer_load_dword v3, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Reload
-; GFX908: global_store_dwordx4 v[{{[0-9:]+}}], v[0:3], off
-
-; GFX90A-DAG: buffer_load_dword v2, off, s[4:7], 0 ; 4-byte Folded Reload
-; GFX90A-DAG: buffer_load_dword v3, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload
-; GFX90A-DAG: buffer_load_dword v4, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload
-; GFX90A-DAG: buffer_load_dword v5, off, s[4:7], 0 offset:12 ; 4-byte Folded Reload
-; GFX90A: global_store_dwordx4 v[0:1], v[2:5], off
-
-; GCN: ScratchSize: 20
define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 {
+; GFX908-LABEL: max_6regs_used_8a:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; GFX908-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
+; GFX908-NEXT: s_mov_b32 s6, -1
+; GFX908-NEXT: s_mov_b32 s7, 0xe00000
+; GFX908-NEXT: s_add_u32 s4, s4, s3
+; GFX908-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; def v1
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; def a[0:3]
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_addc_u32 s5, s5, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v1
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: buffer_store_dword v5, off, s[4:7], 0 ; 4-byte Folded Spill
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a1 ; Reload Reuse
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:4 ; 4-byte Folded Spill
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 ; Reload Reuse
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:8 ; 4-byte Folded Spill
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a3 ; Reload Reuse
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:12 ; 4-byte Folded Spill
+; GFX908-NEXT: s_waitcnt vmcnt(4)
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v0, a[0:3]
+; GFX908-NEXT: s_nop 3
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX908-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; 4-byte Folded Reload
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload
+; GFX908-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload
+; GFX908-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:12 ; 4-byte Folded Reload
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; use v0
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: max_6regs_used_8a:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; GFX90A-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
+; GFX90A-NEXT: s_mov_b32 s6, -1
+; GFX90A-NEXT: s_mov_b32 s7, 0xe00000
+; GFX90A-NEXT: s_add_u32 s4, s4, s3
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_addc_u32 s5, s5, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v1
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def a[0:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: buffer_store_dword a0, off, s[4:7], 0 ; 4-byte Folded Spill
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_store_dword a1, off, s[4:7], 0 offset:4 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword a2, off, s[4:7], 0 offset:8 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword a3, off, s[4:7], 0 offset:12 ; 4-byte Folded Spill
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: global_load_dwordx4 a[0:3], v0, s[2:3]
+; GFX90A-NEXT: v_mov_b32_e32 v2, 1.0
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v2, v2, a[0:3]
+; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
+; GFX90A-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v4, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v5, off, s[4:7], 0 offset:12 ; 4-byte Folded Reload
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use v1
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%v0 = call float asm sideeffect "; def $0", "=v"()
%a4 = call <4 x float> asm sideeffect "; def $0", "=a"()
@@ -115,6 +474,7 @@ define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 {
call void asm sideeffect "; use $0", "v"(float %v0);
ret void
}
+; GCN: ScratchSize: 20
declare i32 @llvm.amdgcn.workitem.id.x()
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
@@ -125,3 +485,5 @@ attributes #1 = { nounwind "amdgpu-num-vgpr"="10" "amdgpu-no-dispatch-id" "amdgp
attributes #2 = { nounwind "amdgpu-num-vgpr"="12" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
attributes #3 = { nounwind "amdgpu-num-vgpr"="32" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
attributes #4 = { nounwind "amdgpu-num-vgpr"="6" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/ssubo.ll b/llvm/test/CodeGen/AMDGPU/ssubo.ll
index 053038d..382d892 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubo.ll
@@ -1,14 +1,116 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=VI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s --check-prefix=GFX9
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefix=GFX10
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s --check-prefix=GFX11
declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
declare { <2 x i32>, <2 x i1> } @llvm.ssub.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
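; Signed subtraction overflows exactly when (b > 0) xor ((a - b) < a). The
; SI/VI lowerings and all 64-bit cases below implement that rule with two
; compares plus an xor; for 32-bit values on GFX9 and newer, a wrapping sub
; is instead compared against a saturating one (v_sub_i32 ... clamp), and a
; mismatch signals overflow.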
-; FUNC-LABEL: {{^}}ssubo_i64_zext:
define amdgpu_kernel void @ssubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
+; SI-LABEL: ssubo_i64_zext:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: s_sub_u32 s10, s2, s8
+; SI-NEXT: s_subb_u32 s11, s3, s9
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
+; SI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[8:9], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: v_mov_b32_e32 v1, s11
+; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0
+; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: ssubo_i64_zext:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: s_sub_u32 s6, s2, s4
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: s_subb_u32 s7, s3, s5
+; VI-NEXT: v_cmp_gt_i64_e64 s[8:9], s[4:5], 0
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[1:2]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: ssubo_i64_zext:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: s_sub_u32 s4, s2, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: s_subb_u32 s5, s3, s7
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[8:9], s[6:7], 0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: ssubo_i64_zext:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_sub_u32 s4, s2, s6
+; GFX10-NEXT: s_subb_u32 s5, s3, s7
+; GFX10-NEXT: v_cmp_gt_i64_e64 s6, s[6:7], 0
+; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3]
+; GFX10-NEXT: s_xor_b32 s2, s6, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: ssubo_i64_zext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_sub_u32 s6, s2, s4
+; GFX11-NEXT: s_subb_u32 s7, s3, s5
+; GFX11-NEXT: v_cmp_gt_i64_e64 s4, s[4:5], 0
+; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], s[2:3]
+; GFX11-NEXT: s_xor_b32 s2, s4, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-NEXT: v_add_co_u32 v0, s2, s6, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
%val = extractvalue { i64, i1 } %ssub, 0
%carry = extractvalue { i64, i1 } %ssub, 1
@@ -18,8 +120,102 @@ define amdgpu_kernel void @ssubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
ret void
}
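; The zext variant folds the overflow bit back into the result: the checks
; above expect the carry to be materialized as 0/1 via v_cndmask and then
; added to the 64-bit difference with an add/addc (or add_co) pair.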
-; FUNC-LABEL: {{^}}s_ssubo_i32:
define amdgpu_kernel void @s_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) nounwind {
+; SI-LABEL: s_ssubo_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_sub_i32 s12, s8, s9
+; SI-NEXT: s_cmp_gt_i32 s9, 0
+; SI-NEXT: s_cselect_b64 s[10:11], -1, 0
+; SI-NEXT: s_cmp_lt_i32 s12, s8
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_cselect_b64 s[8:9], -1, 0
+; SI-NEXT: v_mov_b32_e32 v0, s12
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_xor_b64 s[4:5], s[10:11], s[8:9]
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s6
+; SI-NEXT: s_mov_b32 s3, s7
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_ssubo_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_sub_i32 s6, s4, s5
+; VI-NEXT: s_cmp_gt_i32 s5, 0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: s_cmp_lt_i32 s6, s4
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT: v_mov_b32_e32 v4, s6
+; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT: flat_store_dword v[0:1], v4
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: flat_store_byte v[2:3], v0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_ssubo_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: s_sub_i32 s4, s6, s7
+; GFX9-NEXT: v_sub_i32 v1, s6, v1 clamp
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_ssubo_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_sub_nc_i32 v0, s6, s7 clamp
+; GFX10-NEXT: s_sub_i32 s4, s6, s7
+; GFX10-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s4, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: global_store_dword v1, v2, s[0:1]
+; GFX10-NEXT: global_store_byte v1, v0, s[2:3]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_ssubo_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_sub_nc_i32 v0, s6, s7 clamp
+; GFX11-NEXT: s_sub_i32 s4, s6, s7
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s4, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v1, v2, s[0:1]
+; GFX11-NEXT: global_store_b8 v1, v0, s[2:3]
+; GFX11-NEXT: s_endpgm
%ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind
%val = extractvalue { i32, i1 } %ssub, 0
%carry = extractvalue { i32, i1 } %ssub, 1
@@ -28,8 +224,112 @@ define amdgpu_kernel void @s_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
ret void
}
-; FUNC-LABEL: {{^}}v_ssubo_i32:
define amdgpu_kernel void @v_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
+; SI-LABEL: v_ssubo_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s4
+; SI-NEXT: s_mov_b32 s13, s5
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s4, s2
+; SI-NEXT: s_mov_b32 s5, s3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_sub_i32_e32 v2, vcc, v0, v1
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1
+; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v0
+; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0
+; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v_ssubo_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: flat_load_dword v5, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_sub_u32_e32 v6, vcc, v4, v5
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5
+; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], v6, v4
+; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: flat_store_dword v[0:1], v6
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: flat_store_byte v[2:3], v0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_ssubo_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX9-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_sub_i32 v3, v1, v2 clamp
+; GFX9-NEXT: v_sub_u32_e32 v1, v1, v2
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v1, v3
+; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_ssubo_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_nc_i32 v3, v1, v2 clamp
+; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v2
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_ssubo_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_nc_i32 v3, v1, v2 clamp
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
%a = load i32, ptr addrspace(1) %aptr, align 4
%b = load i32, ptr addrspace(1) %bptr, align 4
%ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind
@@ -40,10 +340,109 @@ define amdgpu_kernel void @v_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
ret void
}
-; FUNC-LABEL: {{^}}s_ssubo_i64:
-; GCN: s_sub_u32
-; GCN: s_subb_u32
define amdgpu_kernel void @s_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) nounwind {
+; SI-LABEL: s_ssubo_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_sub_u32 s12, s4, s6
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: s_subb_u32 s13, s5, s7
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
+; SI-NEXT: v_cmp_gt_i64_e64 s[4:5], s[6:7], 0
+; SI-NEXT: v_mov_b32_e32 v0, s12
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: v_mov_b32_e32 v1, s13
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], vcc
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_ssubo_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_sub_u32 s0, s4, s6
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_subb_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
+; VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: flat_store_byte v[2:3], v0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_ssubo_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_sub_u32 s0, s12, s14
+; GFX9-NEXT: v_mov_b32_e32 v0, s12
+; GFX9-NEXT: v_mov_b32_e32 v1, s13
+; GFX9-NEXT: s_subb_u32 s1, s13, s15
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[14:15], 0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT: global_store_byte v2, v0, s[10:11]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_ssubo_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_sub_u32 s0, s12, s14
+; GFX10-NEXT: s_subb_u32 s1, s13, s15
+; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[14:15], 0
+; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[12:13]
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: s_xor_b32 s0, s2, s3
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_byte v2, v3, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_ssubo_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_sub_u32 s8, s4, s6
+; GFX11-NEXT: s_subb_u32 s9, s5, s7
+; GFX11-NEXT: v_cmp_gt_i64_e64 s6, s[6:7], 0
+; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
+; GFX11-NEXT: v_mov_b32_e32 v0, s8
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX11-NEXT: s_xor_b32 s4, s6, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: s_endpgm
%ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
%val = extractvalue { i64, i1 } %ssub, 0
%carry = extractvalue { i64, i1 } %ssub, 1
@@ -52,16 +451,121 @@ define amdgpu_kernel void @s_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
ret void
}
-; FUNC-LABEL: {{^}}v_ssubo_i64:
-; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
-; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
-
-; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc,
-; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
-
-; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
-; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
define amdgpu_kernel void @v_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
+; SI-LABEL: v_ssubo_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s4
+; SI-NEXT: s_mov_b32 s13, s5
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s4, s2
+; SI-NEXT: s_mov_b32 s5, s3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
+; SI-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3]
+; SI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
+; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0
+; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v_ssubo_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v6, s2
+; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_sub_u32_e32 v8, vcc, v0, v2
+; VI-NEXT: v_subb_u32_e32 v9, vcc, v1, v3, vcc
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3]
+; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1]
+; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9]
+; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: flat_store_byte v[6:7], v0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_ssubo_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[12:13]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[14:15]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
+; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9]
+; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT: global_store_byte v6, v0, s[10:11]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_ssubo_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v6, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
+; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
+; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
+; GFX10-NEXT: s_xor_b32 s0, vcc_lo, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9]
+; GFX10-NEXT: global_store_byte v6, v0, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_ssubo_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[4:11], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v6, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[0:1], v6, s[8:9]
+; GFX11-NEXT: global_load_b64 v[2:3], v6, s[10:11]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
+; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
+; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
+; GFX11-NEXT: s_xor_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v6, v[4:5], s[4:5]
+; GFX11-NEXT: global_store_b8 v6, v0, s[6:7]
+; GFX11-NEXT: s_endpgm
%a = load i64, ptr addrspace(1) %aptr, align 4
%b = load i64, ptr addrspace(1) %bptr, align 4
%ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
@@ -72,14 +576,134 @@ define amdgpu_kernel void @v_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
ret void
}
-; FUNC-LABEL: {{^}}v_ssubo_v2i32:
-; SICIVI: v_cmp_lt_i32
-; SICIVI: v_cmp_lt_i32
-; SICIVI: v_sub_{{[iu]}}32
-; SICIVI: v_cmp_lt_i32
-; SICIVI: v_cmp_lt_i32
-; SICIVI: v_sub_{{[iu]}}32
define amdgpu_kernel void @v_ssubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
+; SI-LABEL: v_ssubo_v2i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s4
+; SI-NEXT: s_mov_b32 s13, s5
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
+; SI-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
+; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 0, v3
+; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v2
+; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], v4, v0
+; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; SI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v_ssubo_v2i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v6, s2
+; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_sub_u32_e32 v9, vcc, v1, v3
+; VI-NEXT: v_sub_u32_e32 v8, vcc, v0, v2
+; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], 0, v3
+; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v2
+; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v0
+; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9]
+; VI-NEXT: flat_store_dwordx2 v[6:7], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_ssubo_v2i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[12:13]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[14:15]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_sub_u32_e32 v5, v1, v3
+; GFX9-NEXT: v_sub_i32 v1, v1, v3 clamp
+; GFX9-NEXT: v_sub_u32_e32 v4, v0, v2
+; GFX9-NEXT: v_sub_i32 v0, v0, v2 clamp
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v5, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v4, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9]
+; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[10:11]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_ssubo_v2i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v5, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v5, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v5, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_nc_u32_e32 v4, v1, v3
+; GFX10-NEXT: v_sub_nc_i32 v1, v1, v3 clamp
+; GFX10-NEXT: v_sub_nc_u32_e32 v3, v0, v2
+; GFX10-NEXT: v_sub_nc_i32 v0, v0, v2 clamp
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: global_store_dwordx2 v5, v[3:4], s[8:9]
+; GFX10-NEXT: global_store_dwordx2 v5, v[0:1], s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_ssubo_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[0:1], v5, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v5, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_nc_u32_e32 v4, v1, v3
+; GFX11-NEXT: v_sub_nc_i32 v1, v1, v3 clamp
+; GFX11-NEXT: v_sub_nc_u32_e32 v3, v0, v2
+; GFX11-NEXT: v_sub_nc_i32 v0, v0, v2 clamp
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v5, v[3:4], s[0:1]
+; GFX11-NEXT: global_store_b64 v5, v[0:1], s[2:3]
+; GFX11-NEXT: s_endpgm
%a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
%b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
%sadd = call { <2 x i32>, <2 x i1> } @llvm.ssub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
diff --git a/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll b/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll
new file mode 100644
index 0000000..42436a1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll
@@ -0,0 +1,180 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX900 %s
+
+
+%pair = type { i32, i32 }
+
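+; The %pair load feeds extractvalue uses on both sides of a branch. In the
+; two functions below only field 0 is live, so the checks expect a single
+; dword load issued ahead of the structurized control flow, leaving just
+; the add in the else arm.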
+define void @test_extractvalue_then_else(ptr %ptr, i1 %cond) {
+; GFX900-LABEL: test_extractvalue_then_else:
+; GFX900: ; %bb.0: ; %if
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: flat_load_dword v3, v[0:1]
+; GFX900-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX900-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2
+; GFX900-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX900-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX900-NEXT: s_cbranch_execz .LBB0_2
+; GFX900-NEXT: ; %bb.1: ; %else
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_add_u32_e32 v3, 1, v3
+; GFX900-NEXT: .LBB0_2: ; %Flow
+; GFX900-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX900-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: flat_store_dword v[0:1], v3
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+if:
+ %load_then = load %pair, ptr %ptr
+ br i1 %cond, label %then, label %else
+
+then:
+ %a_then = extractvalue %pair %load_then, 0
+ br label %merge
+
+else:
+ %a_else = extractvalue %pair %load_then, 0
+ %sum_else = add i32 %a_else, 1
+ br label %merge
+
+merge:
+ %phi = phi i32 [ %a_then, %then ], [ %sum_else, %else ]
+ store i32 %phi, ptr %ptr
+ ret void
+}
+
+define void @test_extractvalue_else_then(ptr %ptr, i1 %cond) {
+; GFX900-LABEL: test_extractvalue_else_then:
+; GFX900: ; %bb.0: ; %if
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: flat_load_dword v3, v[0:1]
+; GFX900-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX900-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2
+; GFX900-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX900-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX900-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX900-NEXT: s_cbranch_execz .LBB1_2
+; GFX900-NEXT: ; %bb.1: ; %else
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_add_u32_e32 v3, 1, v3
+; GFX900-NEXT: .LBB1_2: ; %merge
+; GFX900-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: flat_store_dword v[0:1], v3
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+if:
+ %load_then = load %pair, ptr %ptr
+ br i1 %cond, label %else, label %then
+
+else:
+ %a_else = extractvalue %pair %load_then, 0
+ %sum_else = add i32 %a_else, 1
+ br label %merge
+
+then:
+ %a_then = extractvalue %pair %load_then, 0
+ br label %merge
+
+merge:
+ %phi = phi i32 [ %a_then, %then ], [ %sum_else, %else ]
+ store i32 %phi, ptr %ptr
+ ret void
+}
+
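+; The loop variant repeats the pattern under an extra condition; here both
+; fields are live, so the checks expect a dwordx2 load re-issued inside the
+; loop body on every iteration.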
+define amdgpu_kernel void @test_loop_with_if( ptr %ptr, i1 %cond) #0 {
+; GFX900-LABEL: test_loop_with_if:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT: v_mov_b32_e32 v5, 0
+; GFX900-NEXT: s_mov_b64 s[4:5], 0
+; GFX900-NEXT: s_movk_i32 s10, 0xfe
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: s_bitcmp1_b32 s2, 0
+; GFX900-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX900-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[2:3]
+; GFX900-NEXT: v_mov_b32_e32 v2, s1
+; GFX900-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; GFX900-NEXT: v_mov_b32_e32 v1, s0
+; GFX900-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v3
+; GFX900-NEXT: s_branch .LBB2_2
+; GFX900-NEXT: .LBB2_1: ; %latch
+; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GFX900-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_add_u32_e32 v5, 20, v3
+; GFX900-NEXT: v_cmp_lt_i32_e32 vcc, s10, v5
+; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT: flat_store_dword v[1:2], v3
+; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX900-NEXT: s_cbranch_execz .LBB2_8
+; GFX900-NEXT: .LBB2_2: ; %loop
+; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT: flat_load_dwordx2 v[3:4], v[1:2]
+; GFX900-NEXT: s_and_b64 vcc, exec, s[0:1]
+; GFX900-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX900-NEXT: s_mov_b64 s[6:7], 0
+; GFX900-NEXT: s_cbranch_vccnz .LBB2_4
+; GFX900-NEXT: ; %bb.3: ; %if
+; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GFX900-NEXT: v_cmp_gt_i32_e32 vcc, 11, v5
+; GFX900-NEXT: s_andn2_b64 s[8:9], s[2:3], exec
+; GFX900-NEXT: s_and_b64 s[12:13], vcc, exec
+; GFX900-NEXT: s_mov_b64 s[6:7], -1
+; GFX900-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX900-NEXT: .LBB2_4: ; %Flow
+; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GFX900-NEXT: s_and_saveexec_b64 s[12:13], s[8:9]
+; GFX900-NEXT: s_xor_b64 s[8:9], exec, s[12:13]
+; GFX900-NEXT: s_cbranch_execz .LBB2_6
+; GFX900-NEXT: ; %bb.5: ; %else
+; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_add_u32_e32 v3, v3, v4
+; GFX900-NEXT: s_andn2_b64 s[6:7], s[6:7], exec
+; GFX900-NEXT: .LBB2_6: ; %Flow1
+; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GFX900-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX900-NEXT: s_and_saveexec_b64 s[8:9], s[6:7]
+; GFX900-NEXT: s_cbranch_execz .LBB2_1
+; GFX900-NEXT: ; %bb.7: ; %then
+; GFX900-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GFX900-NEXT: flat_store_dword v[1:2], v0
+; GFX900-NEXT: s_branch .LBB2_1
+; GFX900-NEXT: .LBB2_8: ; %end
+; GFX900-NEXT: s_endpgm
+entry:
+ %a = tail call i32 @llvm.amdgcn.workitem.id.x()
+ br label %loop
+
+loop:
+ %entry_phi = phi i32 [ 0, %entry ], [ %a15, %latch ]
+ %load = load %pair, ptr %ptr
+ br i1 %cond, label %if, label %else
+
+if:
+ %cmp = icmp sgt i32 %entry_phi, 10
+ br i1 %cmp, label %then, label %else
+
+then:
+ %a_then = extractvalue %pair %load, 0
+ store i32 %a, ptr %ptr, align 4
+ br label %latch
+
+else:
+ %a2 = extractvalue %pair %load, 1
+ %y = extractvalue %pair %load, 0
+ %a_else = add i32 %y, %a2
+ br label %latch
+
+latch:
+ %a_test = phi i32 [ %a_then, %then ], [ %a_else, %else ]
+ store i32 %a_test, ptr %ptr
+ %a15 = add nsw i32 %a_test, 20
+ %a16 = icmp slt i32 %a15, 255
+ br i1 %a16, label %loop, label %end
+
+end:
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index d23e314..f6c357d 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -70,12 +70,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9]
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[8:9], 1, v1
-; GLOBALNESS1-NEXT: ; implicit-def: $vgpr59 : SGPR spill to VGPR lane
+; GLOBALNESS1-NEXT: ; implicit-def: $vgpr57 : SGPR spill to VGPR lane
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s8, 0
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s8, 0
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[68:69], 1, v0
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s9, 1
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s9, 1
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[70:71], 1, v3
; GLOBALNESS1-NEXT: v_mov_b32_e32 v46, 0x80
; GLOBALNESS1-NEXT: s_mov_b32 s82, s16
@@ -84,7 +84,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11]
; GLOBALNESS1-NEXT: v_mov_b32_e32 v47, 0
; GLOBALNESS1-NEXT: s_mov_b32 s32, 0
-; GLOBALNESS1-NEXT: ; implicit-def: $vgpr56_vgpr57
+; GLOBALNESS1-NEXT: ; implicit-def: $vgpr58_vgpr59
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -93,24 +93,24 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 2
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 2
; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 3
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 3
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v3
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 4
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 5
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 4
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 5
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v2
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 6
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 7
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 6
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 7
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[80:81], 1, v1
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s70, 8
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s71, 9
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s70, 8
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s71, 9
; GLOBALNESS1-NEXT: s_branch .LBB1_4
; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_readlane_b32 s6, v59, 6
-; GLOBALNESS1-NEXT: v_readlane_b32 s7, v59, 7
+; GLOBALNESS1-NEXT: v_readlane_b32 s6, v57, 6
+; GLOBALNESS1-NEXT: v_readlane_b32 s7, v57, 7
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7]
; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_28
; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow15
@@ -120,7 +120,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: .LBB1_3: ; %Flow28
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS1-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
+; GLOBALNESS1-NEXT: v_pk_mov_b32 v[58:59], v[0:1], v[0:1] op_sel:[0,1]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_29
; GLOBALNESS1-NEXT: .LBB1_4: ; %bb5
; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1
@@ -128,7 +128,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: flat_load_dword v40, v[46:47]
; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40
; GLOBALNESS1-NEXT: buffer_store_dword v42, off, s[0:3], 0
-; GLOBALNESS1-NEXT: flat_load_dword v58, v[46:47]
+; GLOBALNESS1-NEXT: flat_load_dword v56, v[46:47]
; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0
; GLOBALNESS1-NEXT: s_getpc_b64 s[4:5]
; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
@@ -186,10 +186,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[44:45], off
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s8, 10
-; GLOBALNESS1-NEXT: v_writelane_b32 v59, s9, 11
-; GLOBALNESS1-NEXT: v_readlane_b32 s4, v59, 2
-; GLOBALNESS1-NEXT: v_readlane_b32 s5, v59, 3
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s8, 10
+; GLOBALNESS1-NEXT: v_writelane_b32 v57, s9, 11
+; GLOBALNESS1-NEXT: v_readlane_b32 s4, v57, 2
+; GLOBALNESS1-NEXT: v_readlane_b32 s5, v57, 3
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13
; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i
@@ -198,7 +198,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS1-NEXT: .LBB1_13: ; %bb44.lr.ph.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58
+; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56
; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
@@ -228,8 +228,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21
; GLOBALNESS1-NEXT: ; %bb.19: ; %bb3.i.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT: v_readlane_b32 s4, v59, 0
-; GLOBALNESS1-NEXT: v_readlane_b32 s5, v59, 1
+; GLOBALNESS1-NEXT: v_readlane_b32 s4, v57, 0
+; GLOBALNESS1-NEXT: v_readlane_b32 s5, v57, 1
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21
; GLOBALNESS1-NEXT: ; %bb.20: ; %bb6.i.i
@@ -265,7 +265,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_mov_b32 s13, s83
; GLOBALNESS1-NEXT: s_mov_b32 s14, s82
; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41
-; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[56:57], off
+; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[58:59], off
; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[54:55]
; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[96:97]
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14
@@ -277,13 +277,13 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: .LBB1_24: ; %Flow23
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: s_load_dwordx4 s[4:7], s[38:39], 0x0
-; GLOBALNESS1-NEXT: v_readlane_b32 s70, v59, 8
-; GLOBALNESS1-NEXT: v_readlane_b32 s8, v59, 10
+; GLOBALNESS1-NEXT: v_readlane_b32 s70, v57, 8
+; GLOBALNESS1-NEXT: v_readlane_b32 s8, v57, 10
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS1-NEXT: v_readlane_b32 s71, v59, 9
+; GLOBALNESS1-NEXT: v_readlane_b32 s71, v57, 9
; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0)
; GLOBALNESS1-NEXT: s_mov_b32 s55, s7
-; GLOBALNESS1-NEXT: v_readlane_b32 s9, v59, 11
+; GLOBALNESS1-NEXT: v_readlane_b32 s9, v57, 11
; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow24
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[52:53]
@@ -291,8 +291,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2
; GLOBALNESS1-NEXT: ; %bb.26: ; %bb67.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_readlane_b32 s6, v59, 4
-; GLOBALNESS1-NEXT: v_readlane_b32 s7, v59, 5
+; GLOBALNESS1-NEXT: v_readlane_b32 s6, v57, 4
+; GLOBALNESS1-NEXT: v_readlane_b32 s7, v57, 5
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1
; GLOBALNESS1-NEXT: ; %bb.27: ; %bb69.i
@@ -384,12 +384,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9]
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[8:9], 1, v1
-; GLOBALNESS0-NEXT: ; implicit-def: $vgpr59 : SGPR spill to VGPR lane
+; GLOBALNESS0-NEXT: ; implicit-def: $vgpr57 : SGPR spill to VGPR lane
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s8, 0
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s8, 0
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[68:69], 1, v0
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s9, 1
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s9, 1
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[84:85], 1, v3
; GLOBALNESS0-NEXT: v_mov_b32_e32 v46, 0x80
; GLOBALNESS0-NEXT: s_mov_b32 s70, s16
@@ -398,7 +398,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11]
; GLOBALNESS0-NEXT: v_mov_b32_e32 v47, 0
; GLOBALNESS0-NEXT: s_mov_b32 s32, 0
-; GLOBALNESS0-NEXT: ; implicit-def: $vgpr56_vgpr57
+; GLOBALNESS0-NEXT: ; implicit-def: $vgpr58_vgpr59
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -407,24 +407,24 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 2
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 2
; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 3
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 3
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v3
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 4
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 5
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 4
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 5
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v2
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 6
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 7
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 6
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 7
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[80:81], 1, v1
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s84, 8
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s85, 9
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s84, 8
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s85, 9
; GLOBALNESS0-NEXT: s_branch .LBB1_4
; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_readlane_b32 s6, v59, 6
-; GLOBALNESS0-NEXT: v_readlane_b32 s7, v59, 7
+; GLOBALNESS0-NEXT: v_readlane_b32 s6, v57, 6
+; GLOBALNESS0-NEXT: v_readlane_b32 s7, v57, 7
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7]
; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_28
; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow15
@@ -434,7 +434,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: .LBB1_3: ; %Flow28
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS0-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1]
+; GLOBALNESS0-NEXT: v_pk_mov_b32 v[58:59], v[0:1], v[0:1] op_sel:[0,1]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_29
; GLOBALNESS0-NEXT: .LBB1_4: ; %bb5
; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1
@@ -442,7 +442,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: flat_load_dword v40, v[46:47]
; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40
; GLOBALNESS0-NEXT: buffer_store_dword v42, off, s[0:3], 0
-; GLOBALNESS0-NEXT: flat_load_dword v58, v[46:47]
+; GLOBALNESS0-NEXT: flat_load_dword v56, v[46:47]
; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0
; GLOBALNESS0-NEXT: s_getpc_b64 s[4:5]
; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
@@ -500,10 +500,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[44:45], off
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s8, 10
-; GLOBALNESS0-NEXT: v_writelane_b32 v59, s9, 11
-; GLOBALNESS0-NEXT: v_readlane_b32 s4, v59, 2
-; GLOBALNESS0-NEXT: v_readlane_b32 s5, v59, 3
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s8, 10
+; GLOBALNESS0-NEXT: v_writelane_b32 v57, s9, 11
+; GLOBALNESS0-NEXT: v_readlane_b32 s4, v57, 2
+; GLOBALNESS0-NEXT: v_readlane_b32 s5, v57, 3
; GLOBALNESS0-NEXT: s_mov_b32 s83, s55
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13
@@ -513,7 +513,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS0-NEXT: .LBB1_13: ; %bb44.lr.ph.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58
+; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56
; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0)
; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
@@ -543,8 +543,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21
; GLOBALNESS0-NEXT: ; %bb.19: ; %bb3.i.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT: v_readlane_b32 s4, v59, 0
-; GLOBALNESS0-NEXT: v_readlane_b32 s5, v59, 1
+; GLOBALNESS0-NEXT: v_readlane_b32 s4, v57, 0
+; GLOBALNESS0-NEXT: v_readlane_b32 s5, v57, 1
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21
; GLOBALNESS0-NEXT: ; %bb.20: ; %bb6.i.i
@@ -580,7 +580,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_mov_b32 s13, s71
; GLOBALNESS0-NEXT: s_mov_b32 s14, s70
; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41
-; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[56:57], off
+; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[58:59], off
; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[54:55]
; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[96:97]
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14
@@ -591,12 +591,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_branch .LBB1_14
; GLOBALNESS0-NEXT: .LBB1_24: ; %Flow23
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_readlane_b32 s84, v59, 8
-; GLOBALNESS0-NEXT: v_readlane_b32 s8, v59, 10
+; GLOBALNESS0-NEXT: v_readlane_b32 s84, v57, 8
+; GLOBALNESS0-NEXT: v_readlane_b32 s8, v57, 10
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0
; GLOBALNESS0-NEXT: s_mov_b32 s55, s83
-; GLOBALNESS0-NEXT: v_readlane_b32 s85, v59, 9
-; GLOBALNESS0-NEXT: v_readlane_b32 s9, v59, 11
+; GLOBALNESS0-NEXT: v_readlane_b32 s85, v57, 9
+; GLOBALNESS0-NEXT: v_readlane_b32 s9, v57, 11
; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow24
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[52:53]
@@ -604,8 +604,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2
; GLOBALNESS0-NEXT: ; %bb.26: ; %bb67.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_readlane_b32 s6, v59, 4
-; GLOBALNESS0-NEXT: v_readlane_b32 s7, v59, 5
+; GLOBALNESS0-NEXT: v_readlane_b32 s6, v57, 4
+; GLOBALNESS0-NEXT: v_readlane_b32 s7, v57, 5
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1
; GLOBALNESS0-NEXT: ; %bb.27: ; %bb69.i
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index d230ff5..e1574dc 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=VI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s --check-prefix=GFX9
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefix=GFX10
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s --check-prefix=GFX11
define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
; SI-LABEL: s_uaddo_i64_zext:
@@ -12,14 +14,14 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_add_u32 s0, s2, s8
; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_addc_u32 s1, s3, s9
+; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -61,6 +63,40 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_uaddo_i64_zext:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_add_u32 s4, s2, s6
+; GFX10-NEXT: s_addc_u32 s5, s3, s7
+; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[4:5], s[2:3]
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_uaddo_i64_zext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_u32 s4, s2, s4
+; GFX11-NEXT: s_addc_u32 s5, s3, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_lt_u64_e64 s2, s[4:5], s[2:3]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_u32 v0, s2, s4, v0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %uadd, 0
%carry = extractvalue { i64, i1 } %uadd, 1
@@ -76,21 +112,21 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-LABEL: s_uaddo_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s10, s6
-; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_mov_b32 s8, s2
-; SI-NEXT: s_mov_b32 s9, s3
-; SI-NEXT: v_mov_b32_e32 v0, s13
-; SI-NEXT: v_add_i32_e32 v0, vcc, s12, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s8, v0
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s6
+; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0
+; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_uaddo_i32:
@@ -121,6 +157,34 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_byte v0, v2, s[2:3]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_uaddo_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v1, s4, s6, s7
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_uaddo_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v1, s4, s6, s7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
%uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %uadd, 0
%carry = extractvalue { i32, i1 } %uadd, 1
@@ -137,17 +201,15 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -193,6 +255,38 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
; GFX9-NEXT: global_store_byte v0, v2, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_uaddo_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uaddo_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v1, s4, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
@@ -215,17 +309,15 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -283,6 +375,45 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: global_store_byte v0, v2, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_uaddo_i32_novcc:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uaddo_i32_novcc:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v1, s4, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
@@ -306,21 +437,21 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_u32 s6, s4, s6
-; SI-NEXT: s_addc_u32 s7, s5, s7
-; SI-NEXT: s_mov_b32 s14, s10
-; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s8, s0
-; SI-NEXT: s_mov_b32 s9, s1
-; SI-NEXT: s_mov_b32 s12, s2
-; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: s_addc_u32 s7, s5, s7
; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; SI-NEXT: v_mov_b32_e32 v2, s6
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: v_mov_b32_e32 v3, s7
-; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
-; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0
+; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_uaddo_i64:
@@ -359,6 +490,37 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_uaddo_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_add_u32 s0, s12, s14
+; GFX10-NEXT: s_addc_u32 s1, s13, s15
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[12:13]
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_byte v2, v3, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_uaddo_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_add_u32 s6, s4, s6
+; GFX11-NEXT: s_addc_u32 s7, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: s_endpgm
%uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %uadd, 0
%carry = extractvalue { i64, i1 } %uadd, 1
@@ -375,17 +537,15 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -393,8 +553,8 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, v0, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
-; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
+; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -437,6 +597,42 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_uaddo_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
+; GFX10-NEXT: global_store_byte v4, v0, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uaddo_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT: global_store_b8 v4, v0, s[2:3]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i64, ptr addrspace(1) %a.ptr
@@ -459,17 +655,15 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -477,8 +671,8 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0
-; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, v1, v0
+; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
@@ -522,6 +716,42 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_short v0, v2, s[8:9]
; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_uaddo_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_ushort v1, v0, s[12:13]
+; GFX10-NEXT: global_load_ushort v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v2
+; GFX10-NEXT: v_cmp_lt_u32_sdwa s0, v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX10-NEXT: global_store_short v0, v2, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v1, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uaddo_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_d16_b16 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_u16 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, v1, v2
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; GFX11-NEXT: v_cmp_lt_u32_e32 vcc_lo, v3, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v1, s[2:3]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr
@@ -544,17 +774,15 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -606,6 +834,42 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_uaddo_v2i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uaddo_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v1, s4, v1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: v_add_co_u32 v0, s4, v0, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[2:3]
+; GFX11-NEXT: s_endpgm
%a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
%b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
%sadd = call { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
@@ -623,26 +887,27 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
-; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: s_cmp_eq_u32 s0, s1
+; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_cbranch_scc1 .LBB8_2
; SI-NEXT: ; %bb.1: ; %if
; SI-NEXT: s_xor_b64 s[0:1], vcc, -1
; SI-NEXT: .LBB8_2: ; %exit
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; SI-NEXT: s_mov_b32 s10, s2
-; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_mov_b32 s8, s6
-; SI-NEXT: s_mov_b32 s9, s7
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_uaddo_clamp_bit:
@@ -687,6 +952,45 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: global_store_dword v1, v0, s[8:9]
; GFX9-NEXT: global_store_byte v1, v2, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_uaddo_clamp_bit:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, s1, s2, s3
+; GFX10-NEXT: s_cmp_eq_u32 s2, s3
+; GFX10-NEXT: s_cbranch_scc1 .LBB8_2
+; GFX10-NEXT: ; %bb.1: ; %if
+; GFX10-NEXT: s_xor_b32 s0, s1, -1
+; GFX10-NEXT: .LBB8_2: ; %exit
+; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dword v1, v0, s[8:9]
+; GFX10-NEXT: global_store_byte v1, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_uaddo_clamp_bit:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x34
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, s1, s2, s3
+; GFX11-NEXT: s_cmp_eq_u32 s2, s3
+; GFX11-NEXT: s_cbranch_scc1 .LBB8_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: s_xor_b32 s0, s1, -1
+; GFX11-NEXT: .LBB8_2: ; %exit
+; GFX11-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_store_b8 v1, v2, s[6:7]
+; GFX11-NEXT: s_endpgm
entry:
%uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %uadd, 0
@@ -711,19 +1015,19 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; SI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s14, s2
-; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s8
; SI-NEXT: s_mov_b32 s1, s9
-; SI-NEXT: s_mov_b32 s12, s10
-; SI-NEXT: s_mov_b32 s13, s11
+; SI-NEXT: s_mov_b32 s8, s10
+; SI-NEXT: s_mov_b32 s9, s11
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
-; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0
+; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_add_i32_e64 v0, s[0:1], v1, v2
; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: v_add_i32_e64 v0, s[0:1], v1, v2
; SI-NEXT: s_cbranch_vccnz .LBB9_2
; SI-NEXT: ; %bb.1: ; %if
; SI-NEXT: s_xor_b64 s[8:9], s[0:1], -1
@@ -786,6 +1090,50 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_uaddo_clamp_bit:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_add_co_u32 v1, s1, v1, v2
+; GFX10-NEXT: s_cbranch_vccnz .LBB9_2
+; GFX10-NEXT: ; %bb.1: ; %if
+; GFX10-NEXT: s_xor_b32 s0, s1, -1
+; GFX10-NEXT: .LBB9_2: ; %exit
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uaddo_clamp_bit:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT: v_add_co_u32 v1, s5, v1, v2
+; GFX11-NEXT: s_cbranch_vccnz .LBB9_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: s_xor_b32 s4, s5, -1
+; GFX11-NEXT: .LBB9_2: ; %exit
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -813,23 +1161,23 @@ exit:
define amdgpu_cs void @sv_uaddo_i128(ptr addrspace(1) %out, i128 inreg %a, i128 %b) {
; SI-LABEL: sv_uaddo_i128:
; SI: ; %bb.0:
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; SI-NEXT: v_mov_b32_e32 v6, s1
-; SI-NEXT: v_mov_b32_e32 v7, s2
-; SI-NEXT: v_mov_b32_e32 v8, s3
-; SI-NEXT: s_mov_b32 s4, s6
-; SI-NEXT: s_mov_b32 s5, s6
; SI-NEXT: v_addc_u32_e32 v3, vcc, v6, v3, vcc
-; SI-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc
-; SI-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], v[2:3]
-; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; SI-NEXT: v_addc_u32_e32 v5, vcc, v8, v5, vcc
+; SI-NEXT: v_mov_b32_e32 v6, s2
+; SI-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc
+; SI-NEXT: v_mov_b32_e32 v6, s3
+; SI-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc
+; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3]
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[4:5]
+; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; SI-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5]
+; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; SI-NEXT: s_mov_b32 s5, s6
; SI-NEXT: v_and_b32_e32 v2, 1, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
@@ -871,6 +1219,41 @@ define amdgpu_cs void @sv_uaddo_i128(ptr addrspace(1) %out, i128 inreg %a, i128
; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: sv_uaddo_i128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, s0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v4, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v5, vcc_lo
+; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[2:3]
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[4:5]
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5]
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: sv_uaddo_i128:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, s0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v4, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[4:5]
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[2:3]
+; GFX11-NEXT: v_mov_b16_e32 v2.l, v6.l
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b16 v2.l, v2.l, v3.l, vcc_lo
+; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_endpgm
%uadd = call { i128, i1 } @llvm.uadd.with.overflow.i128(i128 %a, i128 %b)
%carry = extractvalue { i128, i1 } %uadd, 1
%carry.ext = zext i1 %carry to i32
diff --git a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
index d0d1ba8..b3166fa 100644
--- a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
+++ b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
@@ -8,9 +8,8 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; CHECK-NEXT: v_mov_b32_e32 v40, v0
-; CHECK-NEXT: v_pk_mov_b32 v[0:1], 0, 0
-; CHECK-NEXT: flat_load_dword v42, v[0:1]
+; CHECK-NEXT: v_pk_mov_b32 v[44:45], 0, 0
+; CHECK-NEXT: flat_load_dword v42, v[44:45]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x8
@@ -19,48 +18,44 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v46, s6
-; CHECK-NEXT: v_mov_b32_e32 v47, s7
+; CHECK-NEXT: v_accvgpr_write_b32 a32, s6
+; CHECK-NEXT: v_accvgpr_write_b32 a33, s7
; CHECK-NEXT: s_mov_b64 s[6:7], src_private_base
; CHECK-NEXT: s_cmp_lg_u32 s64, -1
; CHECK-NEXT: s_cselect_b32 s7, s7, 0
; CHECK-NEXT: s_cselect_b32 s8, s64, 0
; CHECK-NEXT: s_add_u32 s50, s34, 48
; CHECK-NEXT: s_addc_u32 s51, s35, 0
-; CHECK-NEXT: v_pk_mov_b32 v[58:59], s[4:5], s[4:5] op_sel:[0,1]
+; CHECK-NEXT: v_pk_mov_b32 v[56:57], s[4:5], s[4:5] op_sel:[0,1]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, G@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, G@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0
; CHECK-NEXT: s_mov_b32 s6, 0
-; CHECK-NEXT: v_pk_mov_b32 v[0:1], 0, 0
-; CHECK-NEXT: v_mov_b32_e32 v57, s7
+; CHECK-NEXT: v_mov_b32_e32 v47, s7
; CHECK-NEXT: s_mov_b32 s7, s6
; CHECK-NEXT: s_mov_b32 s53, s14
-; CHECK-NEXT: v_accvgpr_write_b32 a33, v1
-; CHECK-NEXT: v_mov_b32_e32 v56, s8
-; CHECK-NEXT: v_pk_mov_b32 v[60:61], s[6:7], s[6:7] op_sel:[0,1]
+; CHECK-NEXT: v_mov_b32_e32 v46, s8
+; CHECK-NEXT: v_pk_mov_b32 v[58:59], s[6:7], s[6:7] op_sel:[0,1]
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[50:51]
; CHECK-NEXT: s_mov_b32 s12, s14
; CHECK-NEXT: s_mov_b32 s13, s15
; CHECK-NEXT: s_mov_b32 s14, s16
-; CHECK-NEXT: v_mov_b32_e32 v31, v40
+; CHECK-NEXT: v_mov_b32_e32 v31, v0
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_mov_b32 s33, s16
; CHECK-NEXT: s_mov_b32 s52, s15
; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11]
-; CHECK-NEXT: v_accvgpr_write_b32 a32, v0
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61]
+; CHECK-NEXT: v_mov_b32_e32 v40, v0
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[58:59]
; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55]
-; CHECK-NEXT: flat_load_dwordx2 v[62:63], v[58:59]
-; CHECK-NEXT: v_accvgpr_read_b32 v0, a32
-; CHECK-NEXT: v_mov_b32_e32 v44, 0
-; CHECK-NEXT: v_mov_b32_e32 v45, 0x3ff00000
-; CHECK-NEXT: v_accvgpr_read_b32 v1, a33
+; CHECK-NEXT: flat_load_dwordx2 v[60:61], v[56:57]
+; CHECK-NEXT: v_mov_b32_e32 v62, 0
+; CHECK-NEXT: v_mov_b32_e32 v63, 0x3ff00000
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[50:51]
@@ -69,20 +64,20 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK-NEXT: s_mov_b32 s13, s52
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_mov_b32_e32 v31, v40
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[44:45]
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61]
+; CHECK-NEXT: flat_store_dwordx2 v[44:45], v[62:63]
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[58:59]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15
; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55]
-; CHECK-NEXT: flat_load_dwordx2 v[0:1], v[56:57] glc
+; CHECK-NEXT: flat_load_dwordx2 v[0:1], v[46:47] glc
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s64
; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, 0, v42
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[62:63]
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[60:61]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[46:47]
-; CHECK-NEXT: buffer_store_dword v47, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v44, v0, s[0:3], 0 offen
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], a[32:33]
+; CHECK-NEXT: buffer_store_dword a33, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v62, v0, s[0:3], 0 offen
; CHECK-NEXT: ; implicit-def: $vgpr4
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index 7d7f1b4..0289dab 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -1,8 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=VI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s --check-prefix=GFX9
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefix=GFX10
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s --check-prefix=GFX11
define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
; SI-LABEL: s_usubo_i64_zext:
@@ -13,14 +14,14 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_sub_u32 s0, s2, s8
; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_subb_u32 s1, s3, s9
+; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -62,6 +63,40 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_usubo_i64_zext:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_sub_u32 s4, s2, s6
+; GFX10-NEXT: s_subb_u32 s5, s3, s7
+; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], s[2:3]
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_usubo_i64_zext:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_sub_u32 s4, s2, s4
+; GFX11-NEXT: s_subb_u32 s5, s3, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], s[2:3]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_u32 v0, s2, s4, v0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) #0
%val = extractvalue { i64, i1 } %usub, 0
%carry = extractvalue { i64, i1 } %usub, 1
@@ -76,21 +111,21 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-LABEL: s_usubo_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd
+; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s10, s6
-; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_mov_b32 s8, s2
-; SI-NEXT: s_mov_b32 s9, s3
-; SI-NEXT: v_mov_b32_e32 v0, s13
-; SI-NEXT: v_sub_i32_e32 v0, vcc, s12, v0
+; SI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s6
+; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0
+; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_usubo_i32:
@@ -121,6 +156,34 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_byte v0, v2, s[2:3]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_usubo_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v1, s4, s6, s7
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_usubo_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v1, s4, s6, s7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
%usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %usub, 0
%carry = extractvalue { i32, i1 } %usub, 1
@@ -137,17 +200,15 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -193,6 +254,38 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
; GFX9-NEXT: global_store_byte v0, v2, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_usubo_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v1, s0, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_usubo_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v1, s4, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
@@ -215,17 +308,15 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -283,6 +374,45 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: global_store_byte v0, v2, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_usubo_i32_novcc:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v1, s0, v1, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: ;;#ASMSTART
+; GFX10-NEXT: ;;#ASMEND
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_usubo_i32_novcc:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v1, s4, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
@@ -306,21 +436,21 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sub_u32 s6, s4, s6
-; SI-NEXT: s_subb_u32 s7, s5, s7
-; SI-NEXT: s_mov_b32 s14, s10
-; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s8, s0
-; SI-NEXT: s_mov_b32 s9, s1
-; SI-NEXT: s_mov_b32 s12, s2
-; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: s_subb_u32 s7, s5, s7
; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
; SI-NEXT: v_mov_b32_e32 v2, s6
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s0, s2
+; SI-NEXT: s_mov_b32 s1, s3
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: v_mov_b32_e32 v3, s7
-; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
-; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0
+; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_usubo_i64:
@@ -359,6 +489,37 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_usubo_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_sub_u32 s0, s12, s14
+; GFX10-NEXT: s_subb_u32 s1, s13, s15
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[12:13]
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_byte v2, v3, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_usubo_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_sub_u32 s6, s4, s6
+; GFX11-NEXT: s_subb_u32 s7, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: s_endpgm
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %usub, 0
%carry = extractvalue { i64, i1 } %usub, 1
@@ -375,17 +536,15 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -393,8 +552,8 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_i32_e32 v2, vcc, v0, v2
; SI-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
-; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
+; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -437,6 +596,42 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_byte v4, v0, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_usubo_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
+; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9]
+; GFX10-NEXT: global_store_byte v4, v0, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_usubo_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT: global_store_b8 v4, v0, s[2:3]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i64, ptr addrspace(1) %a.ptr
@@ -459,17 +654,15 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -477,8 +670,8 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0
-; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, v1, v0
+; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
@@ -522,6 +715,42 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: global_store_short v0, v2, s[8:9]
; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_usubo_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_ushort v1, v0, s[12:13]
+; GFX10-NEXT: global_load_ushort v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_nc_u32_e32 v2, v1, v2
+; GFX10-NEXT: v_cmp_gt_u32_sdwa s0, v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX10-NEXT: global_store_short v0, v2, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v1, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_usubo_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_d16_b16 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_u16 v2, v0, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_nc_u32_e32 v2, v1, v2
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, v3, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v1, s[2:3]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr
@@ -544,17 +773,15 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
-; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
@@ -606,6 +833,42 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_usubo_v2i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v1, s0, v1, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_usubo_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v1, s4, v1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: v_sub_co_u32 v0, s4, v0, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[2:3]
+; GFX11-NEXT: s_endpgm
%a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
%b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
%sadd = call { <2 x i32>, <2 x i1> } @llvm.usub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
@@ -623,26 +886,27 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
-; SI-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
; SI-NEXT: s_cmp_eq_u32 s0, s1
+; SI-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_cbranch_scc1 .LBB8_2
; SI-NEXT: ; %bb.1: ; %if
; SI-NEXT: s_xor_b64 s[0:1], vcc, -1
; SI-NEXT: .LBB8_2: ; %exit
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; SI-NEXT: s_mov_b32 s10, s2
-; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_mov_b32 s8, s6
-; SI-NEXT: s_mov_b32 s9, s7
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_usubo_clamp_bit:
@@ -687,6 +951,45 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: global_store_dword v1, v0, s[8:9]
; GFX9-NEXT: global_store_byte v1, v2, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: s_usubo_clamp_bit:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_sub_co_u32 v0, s1, s2, s3
+; GFX10-NEXT: s_cmp_eq_u32 s2, s3
+; GFX10-NEXT: s_cbranch_scc1 .LBB8_2
+; GFX10-NEXT: ; %bb.1: ; %if
+; GFX10-NEXT: s_xor_b32 s0, s1, -1
+; GFX10-NEXT: .LBB8_2: ; %exit
+; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dword v1, v0, s[8:9]
+; GFX10-NEXT: global_store_byte v1, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_usubo_clamp_bit:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x34
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_sub_co_u32 v0, s1, s2, s3
+; GFX11-NEXT: s_cmp_eq_u32 s2, s3
+; GFX11-NEXT: s_cbranch_scc1 .LBB8_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: s_xor_b32 s0, s1, -1
+; GFX11-NEXT: .LBB8_2: ; %exit
+; GFX11-NEXT: s_load_b128 s[4:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_store_b8 v1, v2, s[6:7]
+; GFX11-NEXT: s_endpgm
entry:
%usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %usub, 0
@@ -712,19 +1015,19 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; SI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s14, s2
-; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s8
; SI-NEXT: s_mov_b32 s1, s9
-; SI-NEXT: s_mov_b32 s12, s10
-; SI-NEXT: s_mov_b32 s13, s11
+; SI-NEXT: s_mov_b32 s8, s10
+; SI-NEXT: s_mov_b32 s9, s11
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
-; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0
+; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_sub_i32_e64 v0, s[0:1], v1, v2
; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: v_sub_i32_e64 v0, s[0:1], v1, v2
; SI-NEXT: s_cbranch_vccnz .LBB9_2
; SI-NEXT: ; %bb.1: ; %if
; SI-NEXT: s_xor_b64 s[8:9], s[0:1], -1
@@ -787,6 +1090,50 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_usubo_clamp_bit:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dword v1, v0, s[12:13]
+; GFX10-NEXT: global_load_dword v2, v0, s[14:15]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_sub_co_u32 v1, s1, v1, v2
+; GFX10-NEXT: s_cbranch_vccnz .LBB9_2
+; GFX10-NEXT: ; %bb.1: ; %if
+; GFX10-NEXT: s_xor_b32 s0, s1, -1
+; GFX10-NEXT: .LBB9_2: ; %exit
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_usubo_clamp_bit:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT: v_sub_co_u32 v1, s5, v1, v2
+; GFX11-NEXT: s_cbranch_vccnz .LBB9_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: s_xor_b32 s4, s5, -1
+; GFX11-NEXT: .LBB9_2: ; %exit
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 2f25a93..fe7def8a 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -1961,16 +1961,15 @@ define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1)
; GFX942-LABEL: shuffle_v6f16_452367:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: global_load_dwordx3 v[0:2], v[6:7], off
-; GFX942-NEXT: global_load_dword v3, v[4:5], off
+; GFX942-NEXT: global_load_dwordx3 v[4:6], v[0:1], off
+; GFX942-NEXT: global_load_dword v4, v[2:3], off
+; GFX942-NEXT: ; kill: killed $vgpr0 killed $vgpr1
+; GFX942-NEXT: ; kill: killed $vgpr2 killed $vgpr3
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v6f16_452367:
@@ -5151,16 +5150,15 @@ define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr addrspace
; GFX942-LABEL: shuffle_v6bf16_452367:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v7, v1
-; GFX942-NEXT: v_mov_b32_e32 v6, v0
-; GFX942-NEXT: v_mov_b32_e32 v5, v3
-; GFX942-NEXT: v_mov_b32_e32 v4, v2
-; GFX942-NEXT: global_load_dwordx3 v[0:2], v[6:7], off
-; GFX942-NEXT: global_load_dword v3, v[4:5], off
+; GFX942-NEXT: global_load_dwordx3 v[4:6], v[0:1], off
+; GFX942-NEXT: global_load_dword v4, v[2:3], off
+; GFX942-NEXT: ; kill: killed $vgpr0 killed $vgpr1
+; GFX942-NEXT: ; kill: killed $vgpr2 killed $vgpr3
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v5
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, v3
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v6bf16_452367:
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index a401f989..d8264b5a 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -58,19 +58,19 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: v_and_b32_e32 v3, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 2, v3
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v3
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dword v1, v2, s[0:1]
+; GFX942-NEXT: global_load_dword v2, v1, s[0:1]
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v3
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB1_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dword v1, v2, s[2:3]
+; GFX942-NEXT: global_load_dword v2, v1, s[2:3]
; GFX942-NEXT: .LBB1_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX942-NEXT: global_store_dword v0, v2, s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -136,19 +136,19 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[0:1]
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB3_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[2:3]
; GFX942-NEXT: .LBB3_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -173,19 +173,19 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: v_and_b32_e32 v6, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, 4, v6
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 4, v6
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1]
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1]
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v6
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB4_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3]
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[2:3]
; GFX942-NEXT: .LBB4_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -210,23 +210,23 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: v_and_b32_e32 v10, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v9, 5, v10
-; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 5, v10
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v9, s[0:1] offset:16
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v9, s[0:1]
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[0:1] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1]
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v10
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB5_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v9, s[2:3] offset:16
-; GFX942-NEXT: global_load_dwordx4 v[0:3], v9, s[2:3]
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[2:3] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[2:3]
; GFX942-NEXT: .LBB5_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] offset:16
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -250,72 +250,72 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v2
+; GFX942-NEXT: v_and_b32_e32 v62, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v62
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 v[28:31], v1, s[0:1] offset:240
-; GFX942-NEXT: global_load_dwordx4 v[24:27], v1, s[0:1] offset:224
-; GFX942-NEXT: global_load_dwordx4 v[20:23], v1, s[0:1] offset:208
-; GFX942-NEXT: global_load_dwordx4 v[16:19], v1, s[0:1] offset:192
-; GFX942-NEXT: global_load_dwordx4 v[12:15], v1, s[0:1] offset:176
-; GFX942-NEXT: global_load_dwordx4 v[8:11], v1, s[0:1] offset:160
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v1, s[0:1] offset:144
-; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[0:1] offset:128
-; GFX942-NEXT: global_load_dwordx4 v[60:63], v1, s[0:1] offset:112
-; GFX942-NEXT: global_load_dwordx4 v[56:59], v1, s[0:1] offset:96
-; GFX942-NEXT: global_load_dwordx4 v[52:55], v1, s[0:1] offset:80
-; GFX942-NEXT: global_load_dwordx4 v[48:51], v1, s[0:1] offset:64
-; GFX942-NEXT: global_load_dwordx4 v[44:47], v1, s[0:1] offset:48
-; GFX942-NEXT: global_load_dwordx4 v[40:43], v1, s[0:1] offset:32
-; GFX942-NEXT: global_load_dwordx4 v[36:39], v1, s[0:1] offset:16
-; GFX942-NEXT: global_load_dwordx4 v[32:35], v1, s[0:1]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v2
+; GFX942-NEXT: global_load_dwordx4 v[30:33], v1, s[0:1] offset:240
+; GFX942-NEXT: global_load_dwordx4 v[26:29], v1, s[0:1] offset:224
+; GFX942-NEXT: global_load_dwordx4 v[22:25], v1, s[0:1] offset:208
+; GFX942-NEXT: global_load_dwordx4 v[18:21], v1, s[0:1] offset:192
+; GFX942-NEXT: global_load_dwordx4 v[14:17], v1, s[0:1] offset:176
+; GFX942-NEXT: global_load_dwordx4 v[10:13], v1, s[0:1] offset:160
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[0:1] offset:144
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1] offset:128
+; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[0:1] offset:112
+; GFX942-NEXT: global_load_dwordx4 v[58:61], v1, s[0:1] offset:96
+; GFX942-NEXT: global_load_dwordx4 v[54:57], v1, s[0:1] offset:80
+; GFX942-NEXT: global_load_dwordx4 v[50:53], v1, s[0:1] offset:64
+; GFX942-NEXT: global_load_dwordx4 v[46:49], v1, s[0:1] offset:48
+; GFX942-NEXT: global_load_dwordx4 v[42:45], v1, s[0:1] offset:32
+; GFX942-NEXT: global_load_dwordx4 v[38:41], v1, s[0:1] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[34:37], v1, s[0:1]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v62
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB6_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx4 v[28:31], v1, s[2:3] offset:240
-; GFX942-NEXT: global_load_dwordx4 v[24:27], v1, s[2:3] offset:224
-; GFX942-NEXT: global_load_dwordx4 v[20:23], v1, s[2:3] offset:208
-; GFX942-NEXT: global_load_dwordx4 v[16:19], v1, s[2:3] offset:192
-; GFX942-NEXT: global_load_dwordx4 v[12:15], v1, s[2:3] offset:176
-; GFX942-NEXT: global_load_dwordx4 v[8:11], v1, s[2:3] offset:160
-; GFX942-NEXT: global_load_dwordx4 v[4:7], v1, s[2:3] offset:144
-; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[2:3] offset:128
-; GFX942-NEXT: global_load_dwordx4 v[60:63], v1, s[2:3] offset:112
-; GFX942-NEXT: global_load_dwordx4 v[56:59], v1, s[2:3] offset:96
-; GFX942-NEXT: global_load_dwordx4 v[52:55], v1, s[2:3] offset:80
-; GFX942-NEXT: global_load_dwordx4 v[48:51], v1, s[2:3] offset:64
-; GFX942-NEXT: global_load_dwordx4 v[44:47], v1, s[2:3] offset:48
-; GFX942-NEXT: global_load_dwordx4 v[40:43], v1, s[2:3] offset:32
-; GFX942-NEXT: global_load_dwordx4 v[36:39], v1, s[2:3] offset:16
-; GFX942-NEXT: global_load_dwordx4 v[32:35], v1, s[2:3]
+; GFX942-NEXT: global_load_dwordx4 v[30:33], v1, s[2:3] offset:240
+; GFX942-NEXT: global_load_dwordx4 v[26:29], v1, s[2:3] offset:224
+; GFX942-NEXT: global_load_dwordx4 v[22:25], v1, s[2:3] offset:208
+; GFX942-NEXT: global_load_dwordx4 v[18:21], v1, s[2:3] offset:192
+; GFX942-NEXT: global_load_dwordx4 v[14:17], v1, s[2:3] offset:176
+; GFX942-NEXT: global_load_dwordx4 v[10:13], v1, s[2:3] offset:160
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[2:3] offset:144
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[2:3] offset:128
+; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[2:3] offset:112
+; GFX942-NEXT: global_load_dwordx4 v[58:61], v1, s[2:3] offset:96
+; GFX942-NEXT: global_load_dwordx4 v[54:57], v1, s[2:3] offset:80
+; GFX942-NEXT: global_load_dwordx4 v[50:53], v1, s[2:3] offset:64
+; GFX942-NEXT: global_load_dwordx4 v[46:49], v1, s[2:3] offset:48
+; GFX942-NEXT: global_load_dwordx4 v[42:45], v1, s[2:3] offset:32
+; GFX942-NEXT: global_load_dwordx4 v[38:41], v1, s[2:3] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[34:37], v1, s[2:3]
; GFX942-NEXT: .LBB6_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[60:63], s[6:7] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] offset:112
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[56:59], s[6:7] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, v[58:61], s[6:7] offset:96
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[52:55], s[6:7] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, v[54:57], s[6:7] offset:80
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[48:51], s[6:7] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, v[50:53], s[6:7] offset:64
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[44:47], s[6:7] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, v[46:49], s[6:7] offset:48
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[40:43], s[6:7] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, v[42:45], s[6:7] offset:32
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[36:39], s[6:7] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, v[38:41], s[6:7] offset:16
; GFX942-NEXT: s_waitcnt vmcnt(7)
-; GFX942-NEXT: global_store_dwordx4 v0, v[32:35], s[6:7]
-; GFX942-NEXT: global_store_dwordx4 v0, v[28:31], s[6:7] offset:240
-; GFX942-NEXT: global_store_dwordx4 v0, v[24:27], s[6:7] offset:224
-; GFX942-NEXT: global_store_dwordx4 v0, v[20:23], s[6:7] offset:208
-; GFX942-NEXT: global_store_dwordx4 v0, v[16:19], s[6:7] offset:192
-; GFX942-NEXT: global_store_dwordx4 v0, v[12:15], s[6:7] offset:176
-; GFX942-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7] offset:160
-; GFX942-NEXT: global_store_dwordx4 v0, v[4:7], s[6:7] offset:144
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] offset:128
+; GFX942-NEXT: global_store_dwordx4 v0, v[34:37], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v0, v[30:33], s[6:7] offset:240
+; GFX942-NEXT: global_store_dwordx4 v0, v[26:29], s[6:7] offset:224
+; GFX942-NEXT: global_store_dwordx4 v0, v[22:25], s[6:7] offset:208
+; GFX942-NEXT: global_store_dwordx4 v0, v[18:21], s[6:7] offset:192
+; GFX942-NEXT: global_store_dwordx4 v0, v[14:17], s[6:7] offset:176
+; GFX942-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] offset:160
+; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] offset:144
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7] offset:128
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -391,17 +391,17 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-LABEL: v8i8_phi_chain:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v2
-; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v2
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v2
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[8:9]
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB8_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[10:11]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v2
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[10:11]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
@@ -410,14 +410,14 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX942-NEXT: s_cbranch_execz .LBB8_4
; GFX942-NEXT: ; %bb.3: ; %bb.2
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[12:13]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[12:13]
; GFX942-NEXT: .LBB8_4: ; %bb.3
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[14:15]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -447,38 +447,38 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa
; GFX942-LABEL: v8i8_phi_zeroinit:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v5, 3, v4
-; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v4
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v5, s[8:9]
-; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[8:9]
+; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB9_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[2:3], v5, s[10:11]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v4
+; GFX942-NEXT: global_load_dwordx2 v[4:5], v1, s[10:11]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX942-NEXT: .LBB9_2: ; %Flow
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX942-NEXT: s_cbranch_execz .LBB9_4
; GFX942-NEXT: ; %bb.3: ; %bb.2
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[12:13]
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[12:13]
; GFX942-NEXT: .LBB9_4: ; %bb.3
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[14:15]
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: global_store_dwordx2 v0, v[4:5], s[14:15]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -617,30 +617,30 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
; GFX942-LABEL: v8i8_multi_block:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v5, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v6, 3, v5
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v5
+; GFX942-NEXT: v_and_b32_e32 v3, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v3
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[2:3], v6, s[8:9]
+; GFX942-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB11_4
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v6, s[10:11]
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v5
+; GFX942-NEXT: global_load_dwordx2 v[6:7], v4, s[10:11]
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v3
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX942-NEXT: s_cbranch_execz .LBB11_3
; GFX942-NEXT: ; %bb.2: ; %bb.2
-; GFX942-NEXT: v_mov_b32_e32 v5, 0
-; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[12:13]
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[12:13]
; GFX942-NEXT: .LBB11_3: ; %Flow
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX942-NEXT: .LBB11_4: ; %bb.3
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[14:15]
+; GFX942-NEXT: global_store_dwordx2 v2, v[6:7], s[14:15]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -859,15 +859,15 @@ define amdgpu_kernel void @v8i8_mfma_i8(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[8:9]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB14_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[10:11]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[10:11]
; GFX942-NEXT: .LBB14_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[14:15], 0x0
@@ -878,9 +878,9 @@ define amdgpu_kernel void @v8i8_mfma_i8(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 6
-; GFX942-NEXT: global_store_dwordx4 v2, a[0:3], s[12:13]
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[12:13]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -909,15 +909,15 @@ define amdgpu_kernel void @v8i8_mfma_half(ptr addrspace(1) %src1, ptr addrspace(
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[36:43], s[4:5], 0x24
; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[36:37]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[36:37]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB15_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[38:39]
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[38:39]
; GFX942-NEXT: .LBB15_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_load_dwordx16 s[16:31], s[42:43], 0x0
@@ -957,18 +957,18 @@ define amdgpu_kernel void @v8i8_mfma_half(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[0:1], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 2
-; GFX942-NEXT: global_store_dwordx4 v2, a[28:31], s[40:41] offset:112
-; GFX942-NEXT: global_store_dwordx4 v2, a[24:27], s[40:41] offset:96
-; GFX942-NEXT: global_store_dwordx4 v2, a[20:23], s[40:41] offset:80
-; GFX942-NEXT: global_store_dwordx4 v2, a[16:19], s[40:41] offset:64
-; GFX942-NEXT: global_store_dwordx4 v2, a[12:15], s[40:41] offset:48
-; GFX942-NEXT: global_store_dwordx4 v2, a[8:11], s[40:41] offset:32
-; GFX942-NEXT: global_store_dwordx4 v2, a[4:7], s[40:41] offset:16
-; GFX942-NEXT: global_store_dwordx4 v2, a[0:3], s[40:41]
+; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[40:41] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[40:41] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[40:41] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[40:41] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[40:41] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[40:41] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[40:41] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[40:41]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-regalloc-error.ll b/llvm/test/CodeGen/AMDGPU/wwm-regalloc-error.ll
index 145f1e4..ff18b32 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-regalloc-error.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-regalloc-error.ll
@@ -2,7 +2,7 @@

; A negative test to capture the expected error when the VGPRs are insufficient for wwm-regalloc.

-; CHECK: error: can't find enough VGPRs for wwm-regalloc
+; CHECK: error: cannot find enough VGPRs for wwm-regalloc

define amdgpu_kernel void @test(i32 %in) {
entry:
diff --git a/llvm/test/CodeGen/ARM/fp16.ll b/llvm/test/CodeGen/ARM/fp16.ll
index dc35fa3..9ff7010 100644
--- a/llvm/test/CodeGen/ARM/fp16.ll
+++ b/llvm/test/CodeGen/ARM/fp16.ll
@@ -86,8 +86,8 @@ define i16 @test_to_fp16(double %in) {

; CHECK-FP16-SAFE: bl __aeabi_d2h

-; CHECK-FP16-UNSAFE: vcvt.f32.f64 s0, d0
-; CHECK-FP16-UNSAFE-NEXT: vcvtb.f16.f32 s0, s0
+; CHECK-FP16-UNSAFE: vmov r0, r1, d0
+; CHECK-FP16-UNSAFE-NEXT: bl __aeabi_d2h

; CHECK-ARMV8: vcvtb.f16.f64 [[TMP:s[0-9]+]], d0
; CHECK-ARMV8: vmov r0, [[TMP]]
diff --git a/llvm/test/CodeGen/ARM/preferred-function-alignment.ll b/llvm/test/CodeGen/ARM/preferred-function-alignment.ll
index f3a227c..2fc6790 100644
--- a/llvm/test/CodeGen/ARM/preferred-function-alignment.ll
+++ b/llvm/test/CodeGen/ARM/preferred-function-alignment.ll
@@ -22,3 +22,11 @@ define void @test() {
define void @test_optsize() optsize {
ret void
}
+
+; CHECK-LABEL: test_minsize
+; ALIGN-CS-16: .p2align 1
+; ALIGN-CS-32: .p2align 2
+
+define void @test_minsize() minsize {
+ ret void
+}
diff --git a/llvm/test/CodeGen/BPF/BTF/map-def-2.ll b/llvm/test/CodeGen/BPF/BTF/map-def-2.ll
index 5f971ec..d4c836f 100644
--- a/llvm/test/CodeGen/BPF/BTF/map-def-2.ll
+++ b/llvm/test/CodeGen/BPF/BTF/map-def-2.ll
@@ -1,5 +1,6 @@
-; RUN: llc -mtriple=bpfel -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s
-; RUN: llc -mtriple=bpfeb -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s
+; RUN: llc -mtriple=bpfel -mcpu=v3 -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
;
; Source code:
; struct key_type {
@@ -18,51 +19,17 @@

@hash_map = dso_local local_unnamed_addr global %struct.map_type zeroinitializer, section ".maps", align 8, !dbg !0

-; CHECK: .long 0 # BTF_KIND_PTR(id = 1)
-; CHECK-NEXT: .long 33554432 # 0x2000000
-; CHECK-NEXT: .long 2
-; CHECK-NEXT: .long 1 # BTF_KIND_STRUCT(id = 2)
-; CHECK-NEXT: .long 67108865 # 0x4000001
-; CHECK-NEXT: .long 4
-; CHECK-NEXT: .long 10
-; CHECK-NEXT: .long 3
-; CHECK-NEXT: .long 0 # 0x0
-; CHECK-NEXT: .long 13 # BTF_KIND_INT(id = 3)
-; CHECK-NEXT: .long 16777216 # 0x1000000
-; CHECK-NEXT: .long 4
-; CHECK-NEXT: .long 16777248 # 0x1000020
-; CHECK-NEXT: .long 17 # BTF_KIND_TYPEDEF(id = 4)
-; CHECK-NEXT: .long 134217728 # 0x8000000
-; CHECK-NEXT: .long 5
-; CHECK-NEXT: .long 28 # BTF_KIND_TYPEDEF(id = 5)
-; CHECK-NEXT: .long 134217728 # 0x8000000
-; CHECK-NEXT: .long 6
-; CHECK-NEXT: .long 38 # BTF_KIND_STRUCT(id = 6)
-; CHECK-NEXT: .long 67108865 # 0x4000001
-; CHECK-NEXT: .long 8
-; CHECK-NEXT: .long 47
-; CHECK-NEXT: .long 1
-; CHECK-NEXT: .long 0 # 0x0
-; CHECK-NEXT: .long 51 # BTF_KIND_VAR(id = 7)
-; CHECK-NEXT: .long 234881024 # 0xe000000
-; CHECK-NEXT: .long 4
-; CHECK-NEXT: .long 1
-; CHECK-NEXT: .long 60 # BTF_KIND_DATASEC(id = 8)
-; CHECK-NEXT: .long 251658241 # 0xf000001
-; CHECK-NEXT: .long 0
-; CHECK-NEXT: .long 7
-; CHECK-NEXT: .long hash_map
-; CHECK-NEXT: .long 8
-
-; CHECK: .ascii "key_type" # string offset=1
-; CHECK: .ascii "a1" # string offset=10
-; CHECK: .ascii "int" # string offset=13
-; CHECK: .ascii "__map_type" # string offset=17
-; CHECK: .ascii "_map_type" # string offset=28
-; CHECK: .ascii "map_type" # string offset=38
-; CHECK: .ascii "key" # string offset=47
-; CHECK: .ascii "hash_map" # string offset=51
-; CHECK: .ascii ".maps" # string offset=60
+; CHECK-BTF: [1] PTR '(anon)' type_id=2
+; CHECK-BTF-NEXT: [2] STRUCT 'key_type' size=4 vlen=1
+; CHECK-BTF-NEXT: 'a1' type_id=3 bits_offset=0
+; CHECK-BTF-NEXT: [3] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED
+; CHECK-BTF-NEXT: [4] STRUCT 'map_type' size=8 vlen=1
+; CHECK-BTF-NEXT: 'key' type_id=1 bits_offset=0
+; CHECK-BTF-NEXT: [5] TYPEDEF '_map_type' type_id=4
+; CHECK-BTF-NEXT: [6] TYPEDEF '__map_type' type_id=5
+; CHECK-BTF-NEXT: [7] VAR 'hash_map' type_id=6, linkage=global
+; CHECK-BTF-NEXT: [8] DATASEC '.maps' size=0 vlen=1
+; CHECK-BTF-NEXT: type_id=7 offset=0 size=8

!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!16, !17, !18}
diff --git a/llvm/test/CodeGen/BPF/BTF/map-def-3.ll b/llvm/test/CodeGen/BPF/BTF/map-def-3.ll
index 6aa8af9..1d95f03 100644
--- a/llvm/test/CodeGen/BPF/BTF/map-def-3.ll
+++ b/llvm/test/CodeGen/BPF/BTF/map-def-3.ll
@@ -1,5 +1,6 @@
-; RUN: llc -mtriple=bpfel -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s
-; RUN: llc -mtriple=bpfeb -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s
+; RUN: llc -mtriple=bpfel -mcpu=v3 -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
;
; Source code:
; struct key_type {
@@ -13,36 +14,13 @@

@hash_map = dso_local local_unnamed_addr constant %struct.key_type zeroinitializer, section ".maps", align 4, !dbg !0

-; CHECK: .long 1 # BTF_KIND_INT(id = 1)
-; CHECK-NEXT: .long 16777216 # 0x1000000
-; CHECK-NEXT: .long 4
-; CHECK-NEXT: .long 16777248 # 0x1000020
-; CHECK-NEXT: .long 0 # BTF_KIND_CONST(id = 2)
-; CHECK-NEXT: .long 167772160 # 0xa000000
-; CHECK-NEXT: .long 3
-; CHECK-NEXT: .long 5 # BTF_KIND_STRUCT(id = 3)
-; CHECK-NEXT: .long 67108865 # 0x4000001
-; CHECK-NEXT: .long 4
-; CHECK-NEXT: .long 14
-; CHECK-NEXT: .long 1
-; CHECK-NEXT: .long 0 # 0x0
-; CHECK-NEXT: .long 17 # BTF_KIND_VAR(id = 4)
-; CHECK-NEXT: .long 234881024 # 0xe000000
-; CHECK-NEXT: .long 2
-; CHECK-NEXT: .long 1
-; CHECK-NEXT: .long 26 # BTF_KIND_DATASEC(id = 5)
-; CHECK-NEXT: .long 251658241 # 0xf000001
-; CHECK-NEXT: .long 0
-; CHECK-NEXT: .long 4
-; CHECK-NEXT: .long hash_map
-; CHECK-NEXT: .long 4
-
-; CHECK: .ascii "int" # string offset=1
-; CHECK: .ascii "key_type" # string offset=5
-; CHECK: .ascii "a1" # string offset=14
-; CHECK: .ascii "hash_map" # string offset=17
-; CHECK: .ascii ".maps" # string offset=26
-
+; CHECK-BTF: [1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED
+; CHECK-BTF-NEXT: [2] STRUCT 'key_type' size=4 vlen=1
+; CHECK-BTF-NEXT: 'a1' type_id=1 bits_offset=0
+; CHECK-BTF-NEXT: [3] CONST '(anon)' type_id=2
+; CHECK-BTF-NEXT: [4] VAR 'hash_map' type_id=3, linkage=global
+; CHECK-BTF-NEXT: [5] DATASEC '.maps' size=0 vlen=1
+; CHECK-BTF-NEXT: type_id=4 offset=0 size=4
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!11, !12, !13}
diff --git a/llvm/test/CodeGen/BPF/BTF/map-def-nested-array.ll b/llvm/test/CodeGen/BPF/BTF/map-def-nested-array.ll
new file mode 100644
index 0000000..fc95daf
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/BTF/map-def-nested-array.ll
@@ -0,0 +1,75 @@
+; RUN: llc -mtriple=bpfel -mcpu=v3 -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF-SHORT %s
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
+; Source:
+; struct nested_value_type {
+; int a1;
+; };
+; struct map_type {
+; struct {
+; struct nested_value_type *value;
+; } *values[];
+; };
+; Compilation flags:
+; clang -target bpf -g -O2 -S -emit-llvm prog.c
+
+; ModuleID = 'prog.c'
+source_filename = "prog.c"
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "bpf"
+
+%struct.map_type = type { [0 x ptr] }
+
+@array_of_maps = dso_local local_unnamed_addr global %struct.map_type zeroinitializer, section ".maps", align 8, !dbg !0
+
+; We expect no forward declarations.
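+; A FWD entry would mean the nested struct pointers were emitted as BTF
+; forward declarations instead of the fully defined STRUCTs asserted below.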
+;
+; CHECK-BTF-SHORT-NOT: FWD
+
+; Assert the whole BTF.
+;
+; CHECK-BTF: [1] PTR '(anon)' type_id=2
+; CHECK-BTF-NEXT: [2] STRUCT 'nested_value_type' size=4 vlen=1
+; CHECK-BTF-NEXT: 'a1' type_id=3 bits_offset=0
+; CHECK-BTF-NEXT: [3] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED
+; CHECK-BTF-NEXT: [4] STRUCT '(anon)' size=8 vlen=1
+; CHECK-BTF-NEXT: 'value' type_id=1 bits_offset=0
+; CHECK-BTF-NEXT: [5] PTR '(anon)' type_id=4
+; CHECK-BTF-NEXT: [6] ARRAY '(anon)' type_id=5 index_type_id=7 nr_elems=0
+; CHECK-BTF-NEXT: [7] INT '__ARRAY_SIZE_TYPE__' size=4 bits_offset=0 nr_bits=32 encoding=(none)
+; CHECK-BTF-NEXT: [8] STRUCT 'map_type' size=0 vlen=1
+; CHECK-BTF-NEXT: 'values' type_id=6 bits_offset=0
+; CHECK-BTF-NEXT: [9] VAR 'array_of_maps' type_id=8, linkage=global
+; CHECK-BTF-NEXT: [10] DATASEC '.maps' size=0 vlen=1
+; CHECK-BTF-NEXT: type_id=9 offset=0 size=0
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!20, !21, !22, !23}
+!llvm.ident = !{!24}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "array_of_maps", scope: !2, file: !3, line: 9, type: !5, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C11, file: !3, producer: "clang version 22.0.0git (git@github.com:llvm/llvm-project.git ed93eaa421b714028b85cc887d80c45991d7207f)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None)
+!3 = !DIFile(filename: "prog.c", directory: "/home/mtardy/llvm-bug-repro", checksumkind: CSK_MD5, checksum: "9381d9e83e9c0b235a14704224815e96")
+!4 = !{!0}
+!5 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "map_type", file: !3, line: 4, elements: !6)
+!6 = !{!7}
+!7 = !DIDerivedType(tag: DW_TAG_member, name: "values", scope: !5, file: !3, line: 7, baseType: !8)
+!8 = !DICompositeType(tag: DW_TAG_array_type, baseType: !9, elements: !18)
+!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64)
+!10 = distinct !DICompositeType(tag: DW_TAG_structure_type, scope: !5, file: !3, line: 5, size: 64, elements: !11)
+!11 = !{!12}
+!12 = !DIDerivedType(tag: DW_TAG_member, name: "value", scope: !10, file: !3, line: 6, baseType: !13, size: 64)
+!13 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !14, size: 64)
+!14 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "nested_value_type", file: !3, line: 1, size: 32, elements: !15)
+!15 = !{!16}
+!16 = !DIDerivedType(tag: DW_TAG_member, name: "a1", scope: !14, file: !3, line: 2, baseType: !17, size: 32)
+!17 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!18 = !{!19}
+!19 = !DISubrange(count: -1)
+!20 = !{i32 7, !"Dwarf Version", i32 5}
+!21 = !{i32 2, !"Debug Info Version", i32 3}
+!22 = !{i32 1, !"wchar_size", i32 4}
+!23 = !{i32 7, !"frame-pointer", i32 2}
+!24 = !{!"clang version 22.0.0git (git@github.com:llvm/llvm-project.git ed93eaa421b714028b85cc887d80c45991d7207f)"}
diff --git a/llvm/test/CodeGen/DirectX/Binding/binding-overlap-6.ll b/llvm/test/CodeGen/DirectX/Binding/binding-overlap-6.ll
new file mode 100644
index 0000000..3c37e63
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/Binding/binding-overlap-6.ll
@@ -0,0 +1,24 @@
+; RUN: not opt -S -passes='dxil-post-optimization-validation' -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
+
+; Check overlap with unbounded array
+
+; B overlaps with C: B is an unbounded array bound at u4, so it occupies
+; u4 and every register above it, including u17 where C is bound.
+; RWBuffer<float> A[3] : register(u0);
+; RWBuffer<float> B[] : register(u4);
+; RWBuffer<float> C : register(u17);
+
+; CHECK: error: resource B at register 4 overlaps with resource C at register 17 in space 0
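+;
+; In the handlefrombinding calls below, the third i32 argument is the binding
+; range size: A spans 3 registers, C spans 1, and B's size of -1 marks it
+; unbounded.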
+
+target triple = "dxil-pc-shadermodel6.3-library"
+
+@A.str = private unnamed_addr constant [2 x i8] c"A\00", align 1
+@B.str = private unnamed_addr constant [2 x i8] c"B\00", align 1
+@C.str = private unnamed_addr constant [2 x i8] c"C\00", align 1
+
+define void @test_overlapping() {
+entry:
+ %h1 = call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 3, i32 0, i1 false, ptr @A.str)
+ %h2 = call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.dx.resource.handlefrombinding(i32 0, i32 4, i32 -1, i32 0, i1 false, ptr @B.str)
+ %h3 = call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.dx.resource.handlefrombinding(i32 0, i32 17, i32 1, i32 0, i1 false, ptr @C.str)
+ ret void
+}
diff --git a/llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll b/llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll
index 16cc1f3..e5a6aa4 100644
--- a/llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll
+++ b/llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll
@@ -183,7 +183,7 @@ b0:
%v11 = call <64 x i32> @llvm.hexagon.V6.vaddubh.128B(<32 x i32> %v10, <32 x i32> undef)
%v12 = call <64 x i32> @llvm.hexagon.V6.vrmpyubi.128B(<64 x i32> %v11, i32 2147483647, i32 1)
store <64 x i32> %v12, ptr @g0, align 128
- call void (ptr, ...) @f1(ptr @g3) #2
+ call void (ptr, ...) @f1(ptr @g3) #3
%v13 = call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 2)
%v14 = call <64 x i32> @llvm.hexagon.V6.vaddubh.128B(<32 x i32> undef, <32 x i32> %v13)
%v15 = call <64 x i32> @llvm.hexagon.V6.vrmpyubi.128B(<64 x i32> %v14, i32 -2147483648, i32 1)
@@ -193,7 +193,7 @@ b0:
%v17 = call <64 x i32> @llvm.hexagon.V6.vaddubh.128B(<32 x i32> undef, <32 x i32> %v16)
%v18 = call <64 x i32> @llvm.hexagon.V6.vrmpyubi.128B(<64 x i32> %v17, i32 0, i32 1)
store <64 x i32> %v18, ptr @g0, align 128
- call void @f0() #2
+ call void @f0() #3
%v19 = call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1)
%v20 = call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 2)
%v21 = call <64 x i32> @llvm.hexagon.V6.vaddubh.128B(<32 x i32> %v19, <32 x i32> %v20)
@@ -205,3 +205,4 @@ b0:
attributes #0 = { nounwind "use-soft-float"="false" "target-cpu"="hexagonv66" "target-features"="+hvxv66,+hvx-length128b" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind optsize }
+attributes #3 = { nounwind minsize }
diff --git a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll
index 086ef54..c18c637 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll
@@ -355,40 +355,46 @@ entry:
define void @buildvector_v32i8_partial(ptr %dst, i8 %a0, i8 %a1, i8 %a2, i8 %a5, i8 %a7, i8 %a8, i8 %a15, i8 %a17, i8 %a18, i8 %a20, i8 %a22, i8 %a23, i8 %a27, i8 %a28, i8 %a31) nounwind {
; CHECK-LABEL: buildvector_v32i8_partial:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi.d $sp, $sp, -96
-; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill
-; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill
-; CHECK-NEXT: addi.d $fp, $sp, 96
-; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
-; CHECK-NEXT: ld.b $t0, $fp, 0
-; CHECK-NEXT: ld.b $t1, $fp, 8
-; CHECK-NEXT: ld.b $t2, $fp, 16
-; CHECK-NEXT: ld.b $t3, $fp, 24
-; CHECK-NEXT: ld.b $t4, $fp, 56
-; CHECK-NEXT: ld.b $t5, $fp, 48
-; CHECK-NEXT: ld.b $t6, $fp, 40
-; CHECK-NEXT: ld.b $t7, $fp, 32
-; CHECK-NEXT: st.b $t4, $sp, 63
-; CHECK-NEXT: st.b $t5, $sp, 60
-; CHECK-NEXT: st.b $t6, $sp, 59
-; CHECK-NEXT: st.b $t7, $sp, 55
-; CHECK-NEXT: st.b $t3, $sp, 54
-; CHECK-NEXT: st.b $t2, $sp, 52
-; CHECK-NEXT: st.b $t1, $sp, 50
-; CHECK-NEXT: st.b $t0, $sp, 49
-; CHECK-NEXT: st.b $a7, $sp, 47
-; CHECK-NEXT: st.b $a6, $sp, 40
-; CHECK-NEXT: st.b $a5, $sp, 39
-; CHECK-NEXT: st.b $a4, $sp, 37
-; CHECK-NEXT: st.b $a3, $sp, 34
-; CHECK-NEXT: st.b $a2, $sp, 33
-; CHECK-NEXT: st.b $a1, $sp, 32
-; CHECK-NEXT: xvld $xr0, $sp, 32
-; CHECK-NEXT: xvst $xr0, $a0, 0
-; CHECK-NEXT: addi.d $sp, $fp, -96
-; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload
-; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload
-; CHECK-NEXT: addi.d $sp, $sp, 96
+; CHECK-NEXT: ld.b $t0, $sp, 56
+; CHECK-NEXT: ld.b $t1, $sp, 48
+; CHECK-NEXT: ld.b $t2, $sp, 40
+; CHECK-NEXT: ld.b $t3, $sp, 32
+; CHECK-NEXT: ld.b $t4, $sp, 24
+; CHECK-NEXT: ld.b $t5, $sp, 16
+; CHECK-NEXT: ld.b $t6, $sp, 8
+; CHECK-NEXT: ld.b $t7, $sp, 0
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 0
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a2, 1
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a3, 2
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a4, 5
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a5, 7
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a6, 8
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a7, 15
+; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
+; CHECK-NEXT: vinsgr2vr.b $vr1, $t7, 1
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
+; CHECK-NEXT: vinsgr2vr.b $vr1, $t6, 2
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
+; CHECK-NEXT: vinsgr2vr.b $vr1, $t5, 4
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
+; CHECK-NEXT: vinsgr2vr.b $vr1, $t4, 6
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
+; CHECK-NEXT: vinsgr2vr.b $vr1, $t3, 7
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
+; CHECK-NEXT: vinsgr2vr.b $vr1, $t2, 11
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
+; CHECK-NEXT: vinsgr2vr.b $vr1, $t1, 12
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
+; CHECK-NEXT: vinsgr2vr.b $vr1, $t0, 15
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
%ins0 = insertelement <32 x i8> undef, i8 %a0, i32 0
@@ -427,6 +433,88 @@ entry:
ret void
}
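+; For build vectors mixing constants and variables, the constant part is
+; materialized as a single splat (xvrepli) and only the variable lanes are
+; then inserted, instead of storing every element through the stack.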
+define void @buildvector_v32i8_with_constant(ptr %dst, i8 %a0, i8 %a1, i8 %a2, i8 %a5, i8 %a8, i8 %a9, i8 %a15, i8 %a17, i8 %a18, i8 %a20, i8 %a22, i8 %a23, i8 %a27, i8 %a28, i8 %a31) nounwind {
+; CHECK-LABEL: buildvector_v32i8_with_constant:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ld.b $t0, $sp, 56
+; CHECK-NEXT: ld.b $t1, $sp, 48
+; CHECK-NEXT: ld.b $t2, $sp, 40
+; CHECK-NEXT: ld.b $t3, $sp, 32
+; CHECK-NEXT: ld.b $t4, $sp, 24
+; CHECK-NEXT: ld.b $t5, $sp, 16
+; CHECK-NEXT: ld.b $t6, $sp, 8
+; CHECK-NEXT: ld.b $t7, $sp, 0
+; CHECK-NEXT: xvrepli.b $xr0, 0
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 0
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a2, 1
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a3, 2
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a4, 5
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a5, 8
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a6, 9
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a7, 15
+; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
+; CHECK-NEXT: vinsgr2vr.b $vr1, $t7, 1
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
+; CHECK-NEXT: vinsgr2vr.b $vr1, $t6, 2
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
+; CHECK-NEXT: vinsgr2vr.b $vr1, $t5, 4
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
+; CHECK-NEXT: vinsgr2vr.b $vr1, $t4, 6
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
+; CHECK-NEXT: vinsgr2vr.b $vr1, $t3, 7
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
+; CHECK-NEXT: vinsgr2vr.b $vr1, $t2, 11
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
+; CHECK-NEXT: vinsgr2vr.b $vr1, $t1, 12
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
+; CHECK-NEXT: vinsgr2vr.b $vr1, $t0, 15
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %ins0 = insertelement <32 x i8> undef, i8 %a0, i32 0
+ %ins1 = insertelement <32 x i8> %ins0, i8 %a1, i32 1
+ %ins2 = insertelement <32 x i8> %ins1, i8 %a2, i32 2
+ %ins3 = insertelement <32 x i8> %ins2, i8 0, i32 3
+ %ins4 = insertelement <32 x i8> %ins3, i8 0, i32 4
+ %ins5 = insertelement <32 x i8> %ins4, i8 %a5, i32 5
+ %ins6 = insertelement <32 x i8> %ins5, i8 undef, i32 6
+ %ins7 = insertelement <32 x i8> %ins6, i8 0, i32 7
+ %ins8 = insertelement <32 x i8> %ins7, i8 %a8, i32 8
+ %ins9 = insertelement <32 x i8> %ins8, i8 %a9, i32 9
+ %ins10 = insertelement <32 x i8> %ins9, i8 0, i32 10
+ %ins11 = insertelement <32 x i8> %ins10, i8 undef, i32 11
+ %ins12 = insertelement <32 x i8> %ins11, i8 0, i32 12
+ %ins13 = insertelement <32 x i8> %ins12, i8 0, i32 13
+ %ins14 = insertelement <32 x i8> %ins13, i8 undef, i32 14
+ %ins15 = insertelement <32 x i8> %ins14, i8 %a15, i32 15
+ %ins16 = insertelement <32 x i8> %ins15, i8 0, i32 16
+ %ins17 = insertelement <32 x i8> %ins16, i8 %a17, i32 17
+ %ins18 = insertelement <32 x i8> %ins17, i8 %a18, i32 18
+ %ins19 = insertelement <32 x i8> %ins18, i8 0, i32 19
+ %ins20 = insertelement <32 x i8> %ins19, i8 %a20, i32 20
+ %ins21 = insertelement <32 x i8> %ins20, i8 0, i32 21
+ %ins22 = insertelement <32 x i8> %ins21, i8 %a22, i32 22
+ %ins23 = insertelement <32 x i8> %ins22, i8 %a23, i32 23
+ %ins24 = insertelement <32 x i8> %ins23, i8 0, i32 24
+ %ins25 = insertelement <32 x i8> %ins24, i8 undef, i32 25
+ %ins26 = insertelement <32 x i8> %ins25, i8 undef, i32 26
+ %ins27 = insertelement <32 x i8> %ins26, i8 %a27, i32 27
+ %ins28 = insertelement <32 x i8> %ins27, i8 %a28, i32 28
+ %ins29 = insertelement <32 x i8> %ins28, i8 0, i32 29
+ %ins30 = insertelement <32 x i8> %ins29, i8 undef, i32 30
+ %ins31 = insertelement <32 x i8> %ins30, i8 %a31, i32 31
+ store <32 x i8> %ins31, ptr %dst
+ ret void
+}
+
define void @buildvector_v16i16(ptr %dst, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
; CHECK-LABEL: buildvector_v16i16:
; CHECK: # %bb.0: # %entry
@@ -497,24 +585,18 @@ entry:
define void @buildvector_v16i16_partial(ptr %dst, i16 %a0, i16 %a2, i16 %a5, i16 %a6, i16 %a7, i16 %a12, i16 %a13) nounwind {
; CHECK-LABEL: buildvector_v16i16_partial:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi.d $sp, $sp, -96
-; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill
-; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill
-; CHECK-NEXT: addi.d $fp, $sp, 96
-; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
-; CHECK-NEXT: st.h $a7, $sp, 58
-; CHECK-NEXT: st.h $a6, $sp, 56
-; CHECK-NEXT: st.h $a5, $sp, 46
-; CHECK-NEXT: st.h $a4, $sp, 44
-; CHECK-NEXT: st.h $a3, $sp, 42
-; CHECK-NEXT: st.h $a2, $sp, 36
-; CHECK-NEXT: st.h $a1, $sp, 32
-; CHECK-NEXT: xvld $xr0, $sp, 32
-; CHECK-NEXT: xvst $xr0, $a0, 0
-; CHECK-NEXT: addi.d $sp, $fp, -96
-; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload
-; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload
-; CHECK-NEXT: addi.d $sp, $sp, 96
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 0
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a2, 2
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a3, 5
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a4, 6
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a5, 7
+; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
+; CHECK-NEXT: vinsgr2vr.h $vr1, $a6, 4
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
+; CHECK-NEXT: vinsgr2vr.h $vr1, $a7, 5
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
%ins0 = insertelement <16 x i16> undef, i16 %a0, i32 0
@@ -537,6 +619,44 @@ entry:
ret void
}
+define void @buildvector_v16i16_with_constant(ptr %dst, i16 %a2, i16 %a3, i16 %a5, i16 %a6, i16 %a7, i16 %a12, i16 %a13) nounwind {
+; CHECK-LABEL: buildvector_v16i16_with_constant:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvrepli.h $xr0, 2
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 2
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a2, 3
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a3, 5
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a4, 6
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a5, 7
+; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
+; CHECK-NEXT: vinsgr2vr.h $vr1, $a6, 4
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
+; CHECK-NEXT: vinsgr2vr.h $vr1, $a7, 5
+; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %ins0 = insertelement <16 x i16> undef, i16 2, i32 0
+ %ins1 = insertelement <16 x i16> %ins0, i16 2, i32 1
+ %ins2 = insertelement <16 x i16> %ins1, i16 %a2, i32 2
+ %ins3 = insertelement <16 x i16> %ins2, i16 %a3, i32 3
+ %ins4 = insertelement <16 x i16> %ins3, i16 2, i32 4
+ %ins5 = insertelement <16 x i16> %ins4, i16 %a5, i32 5
+ %ins6 = insertelement <16 x i16> %ins5, i16 %a6, i32 6
+ %ins7 = insertelement <16 x i16> %ins6, i16 %a7, i32 7
+ %ins8 = insertelement <16 x i16> %ins7, i16 2, i32 8
+ %ins9 = insertelement <16 x i16> %ins8, i16 2, i32 9
+ %ins10 = insertelement <16 x i16> %ins9, i16 2, i32 10
+ %ins11 = insertelement <16 x i16> %ins10, i16 2, i32 11
+ %ins12 = insertelement <16 x i16> %ins11, i16 %a12, i32 12
+ %ins13 = insertelement <16 x i16> %ins12, i16 %a13, i32 13
+ %ins14 = insertelement <16 x i16> %ins13, i16 2, i32 14
+ %ins15 = insertelement <16 x i16> %ins14, i16 2, i32 15
+ store <16 x i16> %ins15, ptr %dst
+ ret void
+}
+
define void @buildvector_v8i32(ptr %dst, i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
; CHECK-LABEL: buildvector_v8i32:
; CHECK: # %bb.0: # %entry
@@ -567,21 +687,11 @@ entry:
define void @buildvector_v8i32_partial(ptr %dst, i32 %a2, i32 %a4, i32 %a5, i32 %a6) nounwind {
; CHECK-LABEL: buildvector_v8i32_partial:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi.d $sp, $sp, -96
-; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill
-; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill
-; CHECK-NEXT: addi.d $fp, $sp, 96
-; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
-; CHECK-NEXT: st.w $a4, $sp, 56
-; CHECK-NEXT: st.w $a3, $sp, 52
-; CHECK-NEXT: st.w $a2, $sp, 48
-; CHECK-NEXT: st.w $a1, $sp, 40
-; CHECK-NEXT: xvld $xr0, $sp, 32
+; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 2
+; CHECK-NEXT: xvinsgr2vr.w $xr0, $a2, 4
+; CHECK-NEXT: xvinsgr2vr.w $xr0, $a3, 5
+; CHECK-NEXT: xvinsgr2vr.w $xr0, $a4, 6
; CHECK-NEXT: xvst $xr0, $a0, 0
-; CHECK-NEXT: addi.d $sp, $fp, -96
-; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload
-; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload
-; CHECK-NEXT: addi.d $sp, $sp, 96
; CHECK-NEXT: ret
entry:
%ins0 = insertelement <8 x i32> undef, i32 undef, i32 0
@@ -596,6 +706,29 @@ entry:
ret void
}
+define void @buildvector_v8i32_with_constant(ptr %dst, i32 %a2, i32 %a4, i32 %a5, i32 %a6) nounwind {
+; CHECK-LABEL: buildvector_v8i32_with_constant:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvrepli.b $xr0, 0
+; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 2
+; CHECK-NEXT: xvinsgr2vr.w $xr0, $a2, 4
+; CHECK-NEXT: xvinsgr2vr.w $xr0, $a3, 5
+; CHECK-NEXT: xvinsgr2vr.w $xr0, $a4, 6
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %ins0 = insertelement <8 x i32> undef, i32 0, i32 0
+ %ins1 = insertelement <8 x i32> %ins0, i32 0, i32 1
+ %ins2 = insertelement <8 x i32> %ins1, i32 %a2, i32 2
+ %ins3 = insertelement <8 x i32> %ins2, i32 0, i32 3
+ %ins4 = insertelement <8 x i32> %ins3, i32 %a4, i32 4
+ %ins5 = insertelement <8 x i32> %ins4, i32 %a5, i32 5
+ %ins6 = insertelement <8 x i32> %ins5, i32 %a6, i32 6
+ %ins7 = insertelement <8 x i32> %ins6, i32 0, i32 7
+ store <8 x i32> %ins7, ptr %dst
+ ret void
+}
+
define void @buildvector_v4i64(ptr %dst, i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
; CHECK-LABEL: buildvector_v4i64:
; CHECK: # %bb.0: # %entry
@@ -617,11 +750,8 @@ entry:
define void @buildvector_v4i64_partial(ptr %dst, i64 %a1, i64 %a2) nounwind {
; CHECK-LABEL: buildvector_v4i64_partial:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xvinsgr2vr.d $xr0, $a2, 0
-; CHECK-NEXT: xvpermi.d $xr0, $xr0, 68
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a1, 0
-; CHECK-NEXT: xvpermi.d $xr1, $xr1, 68
-; CHECK-NEXT: xvpackev.d $xr0, $xr1, $xr0
+; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 1
+; CHECK-NEXT: xvinsgr2vr.d $xr0, $a2, 2
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -633,6 +763,23 @@ entry:
ret void
}
+define void @buildvector_v4i64_with_constant(ptr %dst, i64 %a0, i64 %a2) nounwind {
+; CHECK-LABEL: buildvector_v4i64_with_constant:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvrepli.b $xr0, 0
+; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 0
+; CHECK-NEXT: xvinsgr2vr.d $xr0, $a2, 2
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %ins0 = insertelement <4 x i64> undef, i64 %a0, i32 0
+ %ins1 = insertelement <4 x i64> %ins0, i64 0, i32 1
+ %ins2 = insertelement <4 x i64> %ins1, i64 %a2, i32 2
+ %ins3 = insertelement <4 x i64> %ins2, i64 0, i32 3
+ store <4 x i64> %ins3, ptr %dst
+ ret void
+}
+
define void @buildvector_v8f32(ptr %dst, float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
; CHECK-LABEL: buildvector_v8f32:
; CHECK: # %bb.0: # %entry
@@ -669,21 +816,15 @@ entry:
define void @buildvector_v8f32_partial(ptr %dst, float %a1, float %a2, float %a5, float %a7) nounwind {
; CHECK-LABEL: buildvector_v8f32_partial:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi.d $sp, $sp, -96
-; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill
-; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill
-; CHECK-NEXT: addi.d $fp, $sp, 96
-; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
-; CHECK-NEXT: fst.s $fa3, $sp, 60
-; CHECK-NEXT: fst.s $fa2, $sp, 52
-; CHECK-NEXT: fst.s $fa1, $sp, 40
-; CHECK-NEXT: fst.s $fa0, $sp, 36
-; CHECK-NEXT: xvld $xr0, $sp, 32
+; CHECK-NEXT: # kill: def $f3 killed $f3 def $xr3
+; CHECK-NEXT: # kill: def $f2 killed $f2 def $xr2
+; CHECK-NEXT: # kill: def $f1 killed $f1 def $xr1
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0
+; CHECK-NEXT: xvinsve0.w $xr0, $xr0, 1
+; CHECK-NEXT: xvinsve0.w $xr0, $xr1, 2
+; CHECK-NEXT: xvinsve0.w $xr0, $xr2, 5
+; CHECK-NEXT: xvinsve0.w $xr0, $xr3, 7
; CHECK-NEXT: xvst $xr0, $a0, 0
-; CHECK-NEXT: addi.d $sp, $fp, -96
-; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload
-; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload
-; CHECK-NEXT: addi.d $sp, $sp, 96
; CHECK-NEXT: ret
entry:
%ins0 = insertelement <8 x float> undef, float undef, i32 0
@@ -698,6 +839,34 @@ entry:
ret void
}
+define void @buildvector_v8f32_with_constant(ptr %dst, float %a1, float %a2, float %a5, float %a7) nounwind {
+; CHECK-LABEL: buildvector_v8f32_with_constant:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $f3 killed $f3 def $xr3
+; CHECK-NEXT: # kill: def $f2 killed $f2 def $xr2
+; CHECK-NEXT: # kill: def $f1 killed $f1 def $xr1
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0
+; CHECK-NEXT: lu12i.w $a1, 262144
+; CHECK-NEXT: xvreplgr2vr.w $xr4, $a1
+; CHECK-NEXT: xvinsve0.w $xr4, $xr0, 1
+; CHECK-NEXT: xvinsve0.w $xr4, $xr1, 2
+; CHECK-NEXT: xvinsve0.w $xr4, $xr2, 5
+; CHECK-NEXT: xvinsve0.w $xr4, $xr3, 7
+; CHECK-NEXT: xvst $xr4, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %ins0 = insertelement <8 x float> undef, float 2.0, i32 0
+ %ins1 = insertelement <8 x float> %ins0, float %a1, i32 1
+ %ins2 = insertelement <8 x float> %ins1, float %a2, i32 2
+ %ins3 = insertelement <8 x float> %ins2, float 2.0, i32 3
+ %ins4 = insertelement <8 x float> %ins3, float 2.0, i32 4
+ %ins5 = insertelement <8 x float> %ins4, float %a5, i32 5
+ %ins6 = insertelement <8 x float> %ins5, float 2.0, i32 6
+ %ins7 = insertelement <8 x float> %ins6, float %a7, i32 7
+ store <8 x float> %ins7, ptr %dst
+ ret void
+}
+
define void @buildvector_v4f64(ptr %dst, double %a0, double %a1, double %a2, double %a3) nounwind {
; CHECK-LABEL: buildvector_v4f64:
; CHECK: # %bb.0: # %entry
@@ -724,9 +893,7 @@ define void @buildvector_v4f64_partial(ptr %dst, double %a0, double %a3) nounwin
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: # kill: def $f1_64 killed $f1_64 def $xr1
; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0
-; CHECK-NEXT: xvpermi.d $xr0, $xr0, 68
-; CHECK-NEXT: xvpermi.d $xr1, $xr1, 68
-; CHECK-NEXT: xvpackev.d $xr0, $xr1, $xr0
+; CHECK-NEXT: xvinsve0.d $xr0, $xr1, 3
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -737,3 +904,22 @@ entry:
store <4 x double> %ins3, ptr %dst
ret void
}
+
+define void @buildvector_v4f64_with_constant(ptr %dst, double %a0, double %a3) nounwind {
+; CHECK-LABEL: buildvector_v4f64_with_constant:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $f1_64 killed $f1_64 def $xr1
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0
+; CHECK-NEXT: xvrepli.b $xr2, 0
+; CHECK-NEXT: xvinsve0.d $xr2, $xr0, 0
+; CHECK-NEXT: xvinsve0.d $xr2, $xr1, 3
+; CHECK-NEXT: xvst $xr2, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %ins0 = insertelement <4 x double> undef, double %a0, i32 0
+ %ins1 = insertelement <4 x double> %ins0, double 0.0, i32 1
+ %ins2 = insertelement <4 x double> %ins1, double 0.0, i32 2
+ %ins3 = insertelement <4 x double> %ins2, double %a3, i32 3
+ store <4 x double> %ins3, ptr %dst
+ ret void
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll b/llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll
index c61b784..06d4a5d 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll
@@ -524,9 +524,8 @@ define i8 @xvmsk_eq_v2i64_concat_poison(<2 x i64> %vec) {
; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0
; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 0
; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1
-; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
-; CHECK-NEXT: vpackev.h $vr0, $vr0, $vr1
-; CHECK-NEXT: vslli.h $vr0, $vr0, 15
+; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 1
+; CHECK-NEXT: vslli.h $vr0, $vr1, 15
; CHECK-NEXT: vmskltz.h $vr0, $vr0
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
; CHECK-NEXT: ret
@@ -539,24 +538,20 @@ define i8 @xvmsk_eq_v2i64_concat_poison(<2 x i64> %vec) {
define i8 @xvmsk_ne_v4i32_concat_poison(<4 x i32> %vec) {
; CHECK-LABEL: xvmsk_ne_v4i32_concat_poison:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi.d $sp, $sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vseqi.w $vr0, $vr0, 0
; CHECK-NEXT: vrepli.b $vr1, -1
; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
-; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 3
-; CHECK-NEXT: st.h $a0, $sp, 6
-; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 2
-; CHECK-NEXT: st.h $a0, $sp, 4
-; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 1
-; CHECK-NEXT: st.h $a0, $sp, 2
; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 0
-; CHECK-NEXT: st.h $a0, $sp, 0
-; CHECK-NEXT: vld $vr0, $sp, 0
-; CHECK-NEXT: vslli.h $vr0, $vr0, 15
+; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 0
+; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 1
+; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 1
+; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 2
+; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 2
+; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 3
+; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 3
+; CHECK-NEXT: vslli.h $vr0, $vr1, 15
; CHECK-NEXT: vmskltz.h $vr0, $vr0
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
-; CHECK-NEXT: addi.d $sp, $sp, 16
; CHECK-NEXT: ret
%tobool = icmp ne <4 x i32> %vec, zeroinitializer
%insertvec = shufflevector <4 x i1> %tobool, <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -567,23 +562,19 @@ define i8 @xvmsk_ne_v4i32_concat_poison(<4 x i32> %vec) {
define i8 @xvmsk_ogt_v4f64_concat_poison(<4 x double> %vec) {
; CHECK-LABEL: xvmsk_ogt_v4f64_concat_poison:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi.d $sp, $sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: xvrepli.b $xr1, 0
; CHECK-NEXT: xvfcmp.clt.d $xr0, $xr1, $xr0
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
-; CHECK-NEXT: xvpickve2gr.d $a1, $xr0, 1
-; CHECK-NEXT: xvpickve2gr.d $a2, $xr0, 2
-; CHECK-NEXT: xvpickve2gr.d $a3, $xr0, 3
-; CHECK-NEXT: st.h $a3, $sp, 6
-; CHECK-NEXT: st.h $a2, $sp, 4
-; CHECK-NEXT: st.h $a1, $sp, 2
-; CHECK-NEXT: st.h $a0, $sp, 0
-; CHECK-NEXT: vld $vr0, $sp, 0
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
+; CHECK-NEXT: xvpickve2gr.d $a1, $xr0, 2
+; CHECK-NEXT: xvpickve2gr.d $a2, $xr0, 1
+; CHECK-NEXT: xvpickve2gr.d $a3, $xr0, 0
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a3, 0
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a2, 1
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 2
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 3
; CHECK-NEXT: vslli.h $vr0, $vr0, 15
; CHECK-NEXT: vmskltz.h $vr0, $vr0
; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
-; CHECK-NEXT: addi.d $sp, $sp, 16
; CHECK-NEXT: ret
%tobool = fcmp ogt <4 x double> %vec, zeroinitializer
%insertvec = shufflevector <4 x i1> %tobool, <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
diff --git a/llvm/test/CodeGen/LoongArch/llvm.exp10.ll b/llvm/test/CodeGen/LoongArch/llvm.exp10.ll
index 62ea5cb..030b822c 100644
--- a/llvm/test/CodeGen/LoongArch/llvm.exp10.ll
+++ b/llvm/test/CodeGen/LoongArch/llvm.exp10.ll
@@ -137,20 +137,20 @@ define <2 x float> @exp10_v2f32(<2 x float> %x) #0 {
; LA64-NEXT: addi.d $sp, $sp, -48
; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
-; LA64-NEXT: vreplvei.w $vr0, $vr0, 0
+; LA64-NEXT: vreplvei.w $vr0, $vr0, 1
; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(exp10f)
; LA64-NEXT: jirl $ra, $ra, 0
; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0
; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
-; LA64-NEXT: vreplvei.w $vr0, $vr0, 1
+; LA64-NEXT: vreplvei.w $vr0, $vr0, 0
; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(exp10f)
; LA64-NEXT: jirl $ra, $ra, 0
; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0
; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload
-; LA64-NEXT: vpackev.w $vr0, $vr0, $vr1
+; LA64-NEXT: vextrins.w $vr0, $vr1, 16
; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
; LA64-NEXT: addi.d $sp, $sp, 48
; LA64-NEXT: ret
diff --git a/llvm/test/CodeGen/LoongArch/llvm.sincos.ll b/llvm/test/CodeGen/LoongArch/llvm.sincos.ll
index 383d63c..4ac38a9 100644
--- a/llvm/test/CodeGen/LoongArch/llvm.sincos.ll
+++ b/llvm/test/CodeGen/LoongArch/llvm.sincos.ll
@@ -350,7 +350,7 @@ define { <2 x float>, <2 x float> } @test_sincos_v2f32(<2 x float> %a) #0 {
; LA64-NEXT: addi.d $sp, $sp, -80
; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill
; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
-; LA64-NEXT: vreplvei.w $vr0, $vr0, 0
+; LA64-NEXT: vreplvei.w $vr0, $vr0, 1
; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill
; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(sinf)
@@ -358,14 +358,14 @@ define { <2 x float>, <2 x float> } @test_sincos_v2f32(<2 x float> %a) #0 {
; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0
; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill
; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
-; LA64-NEXT: vreplvei.w $vr0, $vr0, 1
+; LA64-NEXT: vreplvei.w $vr0, $vr0, 0
; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(sinf)
; LA64-NEXT: jirl $ra, $ra, 0
; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0
; LA64-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload
-; LA64-NEXT: vpackev.w $vr0, $vr0, $vr1
+; LA64-NEXT: vextrins.w $vr0, $vr1, 16
; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill
; LA64-NEXT: vld $vr0, $sp, 48 # 16-byte Folded Reload
; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0
@@ -377,9 +377,9 @@ define { <2 x float>, <2 x float> } @test_sincos_v2f32(<2 x float> %a) #0 {
; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(cosf)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0
-; LA64-NEXT: vld $vr1, $sp, 48 # 16-byte Folded Reload
-; LA64-NEXT: vpackev.w $vr1, $vr0, $vr1
+; LA64-NEXT: fmov.s $fa1, $fa0
+; LA64-NEXT: vld $vr0, $sp, 48 # 16-byte Folded Reload
+; LA64-NEXT: vextrins.w $vr1, $vr0, 16
; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload
; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload
; LA64-NEXT: addi.d $sp, $sp, 80
@@ -439,48 +439,60 @@ define { <3 x float>, <3 x float> } @test_sincos_v3f32(<3 x float> %a) #0 {
;
; LA64-LABEL: test_sincos_v3f32:
; LA64: # %bb.0:
-; LA64-NEXT: addi.d $sp, $sp, -112
-; LA64-NEXT: st.d $ra, $sp, 104 # 8-byte Folded Spill
+; LA64-NEXT: addi.d $sp, $sp, -96
+; LA64-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill
; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill
-; LA64-NEXT: vreplvei.w $vr0, $vr0, 2
+; LA64-NEXT: vreplvei.w $vr0, $vr0, 1
; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill
; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(sinf)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: fst.s $fa0, $sp, 88
+; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0
+; LA64-NEXT: vst $vr0, $sp, 64 # 16-byte Folded Spill
; LA64-NEXT: vld $vr0, $sp, 48 # 16-byte Folded Reload
-; LA64-NEXT: vreplvei.w $vr0, $vr0, 1
+; LA64-NEXT: vreplvei.w $vr0, $vr0, 0
; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(sinf)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: fst.s $fa0, $sp, 84
+; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0
+; LA64-NEXT: vld $vr1, $sp, 64 # 16-byte Folded Reload
+; LA64-NEXT: vextrins.w $vr0, $vr1, 16
+; LA64-NEXT: vst $vr0, $sp, 64 # 16-byte Folded Spill
; LA64-NEXT: vld $vr0, $sp, 48 # 16-byte Folded Reload
-; LA64-NEXT: vreplvei.w $vr0, $vr0, 0
+; LA64-NEXT: vreplvei.w $vr0, $vr0, 2
; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill
; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(sinf)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: fst.s $fa0, $sp, 80
+; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0
+; LA64-NEXT: vld $vr1, $sp, 64 # 16-byte Folded Reload
+; LA64-NEXT: vextrins.w $vr1, $vr0, 32
+; LA64-NEXT: vst $vr1, $sp, 64 # 16-byte Folded Spill
; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload
; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(cosf)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: fst.s $fa0, $sp, 72
+; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0
+; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill
; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(cosf)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: fst.s $fa0, $sp, 68
+; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0
+; LA64-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload
+; LA64-NEXT: vextrins.w $vr0, $vr1, 16
+; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill
; LA64-NEXT: vld $vr0, $sp, 48 # 16-byte Folded Reload
; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(cosf)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: fst.s $fa0, $sp, 64
-; LA64-NEXT: vld $vr0, $sp, 80
-; LA64-NEXT: vld $vr1, $sp, 64
-; LA64-NEXT: ld.d $ra, $sp, 104 # 8-byte Folded Reload
-; LA64-NEXT: addi.d $sp, $sp, 112
+; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0
+; LA64-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload
+; LA64-NEXT: vextrins.w $vr1, $vr0, 32
+; LA64-NEXT: vld $vr0, $sp, 64 # 16-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 96
; LA64-NEXT: ret
%result = call { <3 x float>, <3 x float> } @llvm.sincos.v3f32(<3 x float> %a)
ret { <3 x float>, <3 x float> } %result
diff --git a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
index 4dda012..9517558 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
@@ -275,16 +275,13 @@ entry:
define void @buildvector_v16i8_partial(ptr %dst, i8 %a2, i8 %a6, i8 %a8, i8 %a11, i8 %a12, i8 %a15) nounwind {
; CHECK-LABEL: buildvector_v16i8_partial:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi.d $sp, $sp, -16
-; CHECK-NEXT: st.b $a6, $sp, 15
-; CHECK-NEXT: st.b $a5, $sp, 12
-; CHECK-NEXT: st.b $a4, $sp, 11
-; CHECK-NEXT: st.b $a3, $sp, 8
-; CHECK-NEXT: st.b $a2, $sp, 6
-; CHECK-NEXT: st.b $a1, $sp, 2
-; CHECK-NEXT: vld $vr0, $sp, 0
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 2
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a2, 6
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a3, 8
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a4, 11
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a5, 12
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a6, 15
; CHECK-NEXT: vst $vr0, $a0, 0
-; CHECK-NEXT: addi.d $sp, $sp, 16
; CHECK-NEXT: ret
entry:
%ins0 = insertelement <16 x i8> undef, i8 undef, i32 0
@@ -307,6 +304,40 @@ entry:
ret void
}
+define void @buildvector_v16i8_with_constant(ptr %dst, i8 %a0, i8 %a4, i8 %a6, i8 %a8, i8 %a11, i8 %a12, i8 %a15) nounwind {
+; CHECK-LABEL: buildvector_v16i8_with_constant:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vrepli.b $vr0, 0
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 0
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a2, 4
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a3, 6
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a4, 8
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a5, 11
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a6, 12
+; CHECK-NEXT: vinsgr2vr.b $vr0, $a7, 15
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %ins0 = insertelement <16 x i8> undef, i8 %a0, i32 0
+ %ins1 = insertelement <16 x i8> %ins0, i8 0, i32 1
+ %ins2 = insertelement <16 x i8> %ins1, i8 0, i32 2
+ %ins3 = insertelement <16 x i8> %ins2, i8 0, i32 3
+ %ins4 = insertelement <16 x i8> %ins3, i8 %a4, i32 4
+ %ins5 = insertelement <16 x i8> %ins4, i8 0, i32 5
+ %ins6 = insertelement <16 x i8> %ins5, i8 %a6, i32 6
+ %ins7 = insertelement <16 x i8> %ins6, i8 0, i32 7
+ %ins8 = insertelement <16 x i8> %ins7, i8 %a8, i32 8
+ %ins9 = insertelement <16 x i8> %ins8, i8 0, i32 9
+ %ins10 = insertelement <16 x i8> %ins9, i8 0, i32 10
+ %ins11 = insertelement <16 x i8> %ins10, i8 %a11, i32 11
+ %ins12 = insertelement <16 x i8> %ins11, i8 %a12, i32 12
+ %ins13 = insertelement <16 x i8> %ins12, i8 0, i32 13
+ %ins14 = insertelement <16 x i8> %ins13, i8 0, i32 14
+ %ins15 = insertelement <16 x i8> %ins14, i8 %a15, i32 15
+ store <16 x i8> %ins15, ptr %dst
+ ret void
+}
+
define void @buildvector_v8i16(ptr %dst, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
; CHECK-LABEL: buildvector_v8i16:
; CHECK: # %bb.0: # %entry
@@ -337,14 +368,11 @@ entry:
define void @buildvector_v8i16_partial(ptr %dst, i16 %a1, i16 %a3, i16 %a4, i16 %a5) nounwind {
; CHECK-LABEL: buildvector_v8i16_partial:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi.d $sp, $sp, -16
-; CHECK-NEXT: st.h $a4, $sp, 10
-; CHECK-NEXT: st.h $a3, $sp, 8
-; CHECK-NEXT: st.h $a2, $sp, 6
-; CHECK-NEXT: st.h $a1, $sp, 2
-; CHECK-NEXT: vld $vr0, $sp, 0
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 1
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a2, 3
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a3, 4
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a4, 5
; CHECK-NEXT: vst $vr0, $a0, 0
-; CHECK-NEXT: addi.d $sp, $sp, 16
; CHECK-NEXT: ret
entry:
%ins0 = insertelement <8 x i16> undef, i16 undef, i32 0
@@ -359,6 +387,29 @@ entry:
ret void
}
+define void @buildvector_v8i16_with_constant(ptr %dst, i16 %a0, i16 %a3, i16 %a4, i16 %a5) nounwind {
+; CHECK-LABEL: buildvector_v8i16_with_constant:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vrepli.b $vr0, 0
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 0
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a2, 3
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a3, 4
+; CHECK-NEXT: vinsgr2vr.h $vr0, $a4, 5
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %ins0 = insertelement <8 x i16> undef, i16 %a0, i32 0
+ %ins1 = insertelement <8 x i16> %ins0, i16 0, i32 1
+ %ins2 = insertelement <8 x i16> %ins1, i16 undef, i32 2
+ %ins3 = insertelement <8 x i16> %ins2, i16 %a3, i32 3
+ %ins4 = insertelement <8 x i16> %ins3, i16 %a4, i32 4
+ %ins5 = insertelement <8 x i16> %ins4, i16 %a5, i32 5
+ %ins6 = insertelement <8 x i16> %ins5, i16 0, i32 6
+ %ins7 = insertelement <8 x i16> %ins6, i16 undef, i32 7
+ store <8 x i16> %ins7, ptr %dst
+ ret void
+}
+
define void @buildvector_v4i32(ptr %dst, i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind {
; CHECK-LABEL: buildvector_v4i32:
; CHECK: # %bb.0: # %entry
@@ -380,11 +431,8 @@ entry:
define void @buildvector_v4i32_partial(ptr %dst, i32 %a0, i32 %a3) nounwind {
; CHECK-LABEL: buildvector_v4i32_partial:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: pcalau12i $a3, %pc_hi20(.LCPI23_0)
-; CHECK-NEXT: vld $vr0, $a3, %pc_lo12(.LCPI23_0)
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a1, 0
-; CHECK-NEXT: vinsgr2vr.w $vr2, $a2, 0
-; CHECK-NEXT: vshuf.w $vr0, $vr2, $vr1
+; CHECK-NEXT: vinsgr2vr.w $vr0, $a1, 0
+; CHECK-NEXT: vinsgr2vr.w $vr0, $a2, 3
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -396,6 +444,24 @@ entry:
ret void
}
+define void @buildvector_v4i32_with_constant(ptr %dst, i32 %a0, i32 %a2, i32 %a3) nounwind {
+; CHECK-LABEL: buildvector_v4i32_with_constant:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vrepli.w $vr0, 2
+; CHECK-NEXT: vinsgr2vr.w $vr0, $a1, 0
+; CHECK-NEXT: vinsgr2vr.w $vr0, $a2, 2
+; CHECK-NEXT: vinsgr2vr.w $vr0, $a3, 3
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %ins0 = insertelement <4 x i32> undef, i32 %a0, i32 0
+ %ins1 = insertelement <4 x i32> %ins0, i32 2, i32 1
+ %ins2 = insertelement <4 x i32> %ins1, i32 %a2, i32 2
+ %ins3 = insertelement <4 x i32> %ins2, i32 %a3, i32 3
+ store <4 x i32> %ins3, ptr %dst
+ ret void
+}
+
define void @buildvector_v2i64(ptr %dst, i64 %a0, i64 %a1) nounwind {
; CHECK-LABEL: buildvector_v2i64:
; CHECK: # %bb.0: # %entry
@@ -423,6 +489,20 @@ entry:
ret void
}
+define void @buildvector_v2i64_with_constant(ptr %dst, i64 %a1) nounwind {
+; CHECK-LABEL: buildvector_v2i64_with_constant:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vrepli.b $vr0, 0
+; CHECK-NEXT: vinsgr2vr.d $vr0, $a1, 1
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %ins0 = insertelement <2 x i64> undef, i64 0, i32 0
+ %ins1 = insertelement <2 x i64> %ins0, i64 %a1, i32 1
+ store <2 x i64> %ins1, ptr %dst
+ ret void
+}
+
define void @buildvector_v4f32(ptr %dst, float %a0, float %a1, float %a2, float %a3) nounwind {
; CHECK-LABEL: buildvector_v4f32:
; CHECK: # %bb.0: # %entry
@@ -447,12 +527,10 @@ entry:
define void @buildvector_v4f32_partial(ptr %dst, float %a0, float %a3) nounwind {
; CHECK-LABEL: buildvector_v4f32_partial:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI27_0)
-; CHECK-NEXT: vld $vr2, $a1, %pc_lo12(.LCPI27_0)
; CHECK-NEXT: # kill: def $f1 killed $f1 def $vr1
; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0
-; CHECK-NEXT: vshuf.w $vr2, $vr1, $vr0
-; CHECK-NEXT: vst $vr2, $a0, 0
+; CHECK-NEXT: vextrins.w $vr0, $vr1, 48
+; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
%ins0 = insertelement <4 x float> undef, float %a0, i32 0
@@ -463,6 +541,27 @@ entry:
ret void
}
+define void @buildvector_v4f32_with_constant(ptr %dst, float %a1, float %a2, float %a3) nounwind {
+; CHECK-LABEL: buildvector_v4f32_with_constant:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $f2 killed $f2 def $vr2
+; CHECK-NEXT: # kill: def $f1 killed $f1 def $vr1
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0
+; CHECK-NEXT: vrepli.b $vr3, 0
+; CHECK-NEXT: vextrins.w $vr3, $vr0, 16
+; CHECK-NEXT: vextrins.w $vr3, $vr1, 32
+; CHECK-NEXT: vextrins.w $vr3, $vr2, 48
+; CHECK-NEXT: vst $vr3, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %ins0 = insertelement <4 x float> undef, float 0.0, i32 0
+ %ins1 = insertelement <4 x float> %ins0, float %a1, i32 1
+ %ins2 = insertelement <4 x float> %ins1, float %a2, i32 2
+ %ins3 = insertelement <4 x float> %ins2, float %a3, i32 3
+ store <4 x float> %ins3, ptr %dst
+ ret void
+}
+
define void @buildvector_v2f64(ptr %dst, double %a0, double %a1) nounwind {
; CHECK-LABEL: buildvector_v2f64:
; CHECK: # %bb.0: # %entry
@@ -482,7 +581,7 @@ define void @buildvector_v2f64_partial(ptr %dst, double %a1) nounwind {
; CHECK-LABEL: buildvector_v2f64_partial:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
+; CHECK-NEXT: vextrins.d $vr0, $vr0, 16
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -492,6 +591,21 @@ entry:
ret void
}
+define void @buildvector_v2f64_with_constant(ptr %dst, double %a0) nounwind {
+; CHECK-LABEL: buildvector_v2f64_with_constant:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
+; CHECK-NEXT: vldi $vr1, -1024
+; CHECK-NEXT: vpackev.d $vr0, $vr1, $vr0
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %ins0 = insertelement <2 x double> undef, double %a0, i32 0
+ %ins1 = insertelement <2 x double> %ins0, double 2.0, i32 1
+ store <2 x double> %ins1, ptr %dst
+ ret void
+}
+
;; If `isShuffleMaskLegal` returns true, it will lead to an infinite loop.
define void @extract1_i32_zext_insert0_i64_undef(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: extract1_i32_zext_insert0_i64_undef:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/noalias-addrspace-expect-id.mir b/llvm/test/CodeGen/MIR/AMDGPU/noalias-addrspace-expect-id.mir
new file mode 100644
index 0000000..4179ff2
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/AMDGPU/noalias-addrspace-expect-id.mir
@@ -0,0 +1,29 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: not llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
+
+--- |
+ define void @expect_id(ptr %ptr, float %data) #0 {
+ %1 = atomicrmw fadd ptr %ptr, float %data syncscope("agent") seq_cst, align 4, !noalias.addrspace !0
+ ret void
+ }
+
+ attributes #0 = { "target-cpu"="gfx1200" }
+
+ !0 = !{i32 5, i32 6}
+...
+
+---
+name: expect_id
+
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
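+    ; The bare "!!" on the FLAT_ATOMIC_ADD_F32 operand below is intentionally
+    ; malformed input used to trigger the parser diagnostic checked here.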
+ ; CHECK: expected metadata id after '!'
+ %2:vgpr_32 = COPY $vgpr0
+ %3:vgpr_32 = COPY $vgpr1
+ %0:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
+ %1:vgpr_32 = COPY $vgpr2
+ FLAT_ATOMIC_ADD_F32 %0, %1, 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, !noalias.addrspace !!)
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/noalias-addrspace-parse.mir b/llvm/test/CodeGen/MIR/AMDGPU/noalias-addrspace-parse.mir
new file mode 100644
index 0000000..7fe6aa9
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/AMDGPU/noalias-addrspace-parse.mir
@@ -0,0 +1,36 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=none -o - %s | FileCheck %s
+
+
+--- |
+ define void @test_parsing_printing(ptr %ptr, float %data) {
+ %1 = atomicrmw fadd ptr %ptr, float %data syncscope("agent") seq_cst, align 4, !noalias.addrspace !0
+ ret void
+ }
+
+ !0 = !{i32 5, i32 6}
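+  ; !0 is a half-open address-space range [5, 6): the access is known not to
+  ; touch addrspace(5) (AMDGPU private memory). The test only checks that the
+  ; metadata survives a MIR print/parse round trip.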
+...
+
+---
+name: test_parsing_printing
+
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: test_parsing_printing
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, !noalias.addrspace !0)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %2:vgpr_32 = COPY $vgpr0
+ %3:vgpr_32 = COPY $vgpr1
+ %0:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
+ %1:vgpr_32 = COPY $vgpr2
+ FLAT_ATOMIC_ADD_F32 %0, %1, 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, !noalias.addrspace !0)
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/noalias-addrspace-undefined-metadata.mir b/llvm/test/CodeGen/MIR/AMDGPU/noalias-addrspace-undefined-metadata.mir
new file mode 100644
index 0000000..505b514
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/AMDGPU/noalias-addrspace-undefined-metadata.mir
@@ -0,0 +1,28 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: not llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
+
+
+--- |
+ define void @undefined_metadata(ptr %ptr, float %data) {
+ %1 = atomicrmw fadd ptr %ptr, float %data syncscope("agent") seq_cst, align 4, !noalias.addrspace !0
+ ret void
+ }
+
+ !0 = !{i32 5, i32 6}
+...
+
+---
+name: undefined_metadata
+
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK: use of undefined metadata '!3'
+ %2:vgpr_32 = COPY $vgpr0
+ %3:vgpr_32 = COPY $vgpr1
+ %0:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
+ %1:vgpr_32 = COPY $vgpr2
+ FLAT_ATOMIC_ADD_F32 %0, %1, 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, !noalias.addrspace !3)
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/MIR/X86/call-site-info-ambiguous-indirect-call-typeid.mir b/llvm/test/CodeGen/MIR/X86/call-site-info-ambiguous-indirect-call-typeid.mir
new file mode 100644
index 0000000..cb78898
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/X86/call-site-info-ambiguous-indirect-call-typeid.mir
@@ -0,0 +1,31 @@
+# Test that the MIR printer and parser correctly handle a call instruction
+# with multiple callee types.
+
+# RUN: llc -mtriple=x86_64 --call-graph-section %s -run-pass=none -o - | FileCheck --match-full-lines %s
+# CHECK: name: ambiguous_caller
+# CHECK: callSites:
+# CHECK-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs: {{.*}}, calleeTypeIds:
+# CHECK-NEXT: [ 1234, 5678 ] }
+
+--- |
+ define ptr @ambiguous_caller() {
+ entry:
+ %fn = alloca ptr, align 8
+ %call1 = call ptr %fn(i64 4), !callee_type !0
+ ret ptr %call1
+ }
+
+ !0 = !{!1, !2}
+ !1 = !{i64 0, !"callee_type0.generalized"}
+ !2 = !{i64 0, !"callee_type2.generalized"}
+...
+---
+name: ambiguous_caller
+callSites:
+ - { bb: 0, offset: 1, fwdArgRegs: [], calleeTypeIds: [ 1234, 5678 ] }
+body: |
+ bb.0.entry:
+ %0:gr64 = MOV32ri64 4
+ CALL64r killed %0, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+ RET 0, $rax
+...
diff --git a/llvm/test/CodeGen/MIR/X86/call-site-info-direct-calls-typeid.mir b/llvm/test/CodeGen/MIR/X86/call-site-info-direct-calls-typeid.mir
new file mode 100644
index 0000000..faa021c
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/X86/call-site-info-direct-calls-typeid.mir
@@ -0,0 +1,54 @@
+# Test that the MIR printer and parser do NOT emit a `calleeTypeIds` field in
+# callSites. `calleeTypeIds` is used to propagate call site type identifiers
+# for indirect targets only; this test contains no indirect targets.
+
+# RUN: llc -mtriple=x86_64 --call-graph-section %s -run-pass=none -o - | FileCheck --match-full-lines %s
+# CHECK-NOT: calleeTypeIds
+# CHECK: name: bar
+# CHECK: callSites:
+# CHECK-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs: [] }
+# CHECK-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs: [] }
+# CHECK: name: foo
+# CHECK: callSites:
+# CHECK-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs: [] }
+
+--- |
+ declare i32 @fizz(i32, i32)
+
+ declare i32 @buzz(i32, i32)
+
+ define i32 @bar(i32 %x, i32 %y) !type !0 {
+ entry:
+ %call = call i32 @buzz(i32 %x, i32 %x)
+ %call1 = call i32 @fizz(i32 %x, i32 %x)
+ ret i32 0
+ }
+
+ define i32 @foo(i32 %x, i32 %y) !type !0 {
+ entry:
+ %call1 = call i32 @bar(i32 %x, i32 %x)
+ ret i32 0
+ }
+
+ !0 = !{i64 0, !"_ZTSFiiiE.generalized"}
+...
+---
+name: bar
+callSites:
+ - { bb: 0, offset: 0, fwdArgRegs: [] }
+ - { bb: 0, offset: 1, fwdArgRegs: [] }
+body: |
+ bb.0.entry:
+ CALL64pcrel32 target-flags(x86-plt) @buzz, csr_64, implicit $rsp, implicit $ssp, implicit $edi, implicit $esi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax
+ CALL64pcrel32 target-flags(x86-plt) @fizz, csr_64, implicit $rsp, implicit $ssp, implicit $edi, implicit $esi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax
+
+...
+---
+name: foo
+callSites:
+ - { bb: 0, offset: 0, fwdArgRegs: [] }
+body: |
+ bb.0.entry:
+ CALL64pcrel32 target-flags(x86-plt) @bar, csr_64, implicit $rsp, implicit $ssp, implicit $edi, implicit $esi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax
+
+...
diff --git a/llvm/test/CodeGen/MIR/X86/call-site-info-typeid.mir b/llvm/test/CodeGen/MIR/X86/call-site-info-typeid.mir
new file mode 100644
index 0000000..303b8fa
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/X86/call-site-info-typeid.mir
@@ -0,0 +1,28 @@
+# Test the MIR printer and parser for the type id field in callSites. It is
+# used to propagate call site type identifiers that are emitted in the call
+# graph section.
+
+# RUN: llc -mtriple=x86_64 --call-graph-section %s -run-pass=none -o - | FileCheck --match-full-lines %s
+# CHECK: name: call_foo
+# CHECK: callSites:
+# CHECK-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs: [], calleeTypeIds:
+# CHECK-NEXT: [ 123456789 ] }
+
+--- |
+ define i32 @call_foo() {
+ entry:
+ %0 = load ptr, ptr null, align 8
+ call void %0(i8 0), !callee_type !0
+ ret i32 0
+ }
+
+ !0 = !{!1}
+ !1 = !{i64 0, !"_ZTSFvcE.generalized"}
+...
+---
+name: call_foo
+callSites:
+ - { bb: 0, offset: 0, fwdArgRegs: [], calleeTypeIds: [ 123456789 ] }
+body: |
+ bb.0.entry:
+ CALL64m $noreg, 1, $noreg, 0, $noreg, csr_64, implicit $rsp, implicit $ssp, implicit $edi, implicit-def $rsp, implicit-def $ssp :: (load (s64) from `ptr null`)
+...
diff --git a/llvm/test/CodeGen/Mips/abiflags-soft-float.ll b/llvm/test/CodeGen/Mips/abiflags-soft-float.ll
new file mode 100644
index 0000000..01821f2
--- /dev/null
+++ b/llvm/test/CodeGen/Mips/abiflags-soft-float.ll
@@ -0,0 +1,12 @@
+; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips32 %s -o %t.o
+; RUN: llvm-readobj -A %t.o | FileCheck %s -check-prefix=OBJ
+; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 %s -o - | \
+; RUN: FileCheck %s -check-prefix=ASM
+
+; OBJ: FP ABI: Soft float
+; ASM: .module softfloat
+
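+; The FP ABI recorded in .MIPS.abiflags (and the .module directive) must honor
+; the "use-soft-float" attribute even when the function body is only an empty
+; inline asm and contains no floating-point operations to infer it from.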
+define dso_local void @asm_is_null() "use-soft-float"="true" {
+ call void asm sideeffect "", ""()
+ ret void
+}
diff --git a/llvm/test/CodeGen/Mips/llvm.frexp.ll b/llvm/test/CodeGen/Mips/llvm.frexp.ll
new file mode 100644
index 0000000..3226766
--- /dev/null
+++ b/llvm/test/CodeGen/Mips/llvm.frexp.ll
@@ -0,0 +1,651 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=mipsel < %s | FileCheck -check-prefix=MIPSEL %s
+; RUN: llc -mtriple=mips < %s | FileCheck %s -check-prefixes=SOFT-FLOAT-32
+; RUN: llc -mtriple=mips64 < %s | FileCheck %s -check-prefixes=SOFT-FLOAT-64
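+;
+; frexp(x) decomposes x into a fraction f and an integer exponent e such that
+; x = f * 2^e with 0.5 <= |f| < 1 for finite non-zero x. MIPS has no
+; instruction for this, so it lowers to a frexpf libcall; half values are
+; extended via __extendhfsf2 first and the result truncated back with
+; __truncsfhf2, as the CHECK lines below show.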
+
+define { half, i32 } @test_frexp_f16_i32(half %a) nounwind {
+; MIPSEL-LABEL: test_frexp_f16_i32:
+; MIPSEL: # %bb.0:
+; MIPSEL-NEXT: addiu $sp, $sp, -24
+; MIPSEL-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: jal __extendhfsf2
+; MIPSEL-NEXT: nop
+; MIPSEL-NEXT: addiu $5, $sp, 16
+; MIPSEL-NEXT: jal frexpf
+; MIPSEL-NEXT: mov.s $f12, $f0
+; MIPSEL-NEXT: jal __truncsfhf2
+; MIPSEL-NEXT: mov.s $f12, $f0
+; MIPSEL-NEXT: lw $3, 16($sp)
+; MIPSEL-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: jr $ra
+; MIPSEL-NEXT: addiu $sp, $sp, 24
+;
+; SOFT-FLOAT-32-LABEL: test_frexp_f16_i32:
+; SOFT-FLOAT-32: # %bb.0:
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -24
+; SOFT-FLOAT-32-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: jal __extendhfsf2
+; SOFT-FLOAT-32-NEXT: nop
+; SOFT-FLOAT-32-NEXT: addiu $5, $sp, 16
+; SOFT-FLOAT-32-NEXT: jal frexpf
+; SOFT-FLOAT-32-NEXT: mov.s $f12, $f0
+; SOFT-FLOAT-32-NEXT: jal __truncsfhf2
+; SOFT-FLOAT-32-NEXT: mov.s $f12, $f0
+; SOFT-FLOAT-32-NEXT: lw $3, 16($sp)
+; SOFT-FLOAT-32-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: jr $ra
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 24
+;
+; SOFT-FLOAT-64-LABEL: test_frexp_f16_i32:
+; SOFT-FLOAT-64: # %bb.0:
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -16
+; SOFT-FLOAT-64-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: jal __extendhfsf2
+; SOFT-FLOAT-64-NEXT: sll $4, $4, 0
+; SOFT-FLOAT-64-NEXT: daddiu $5, $sp, 4
+; SOFT-FLOAT-64-NEXT: jal frexpf
+; SOFT-FLOAT-64-NEXT: mov.s $f12, $f0
+; SOFT-FLOAT-64-NEXT: jal __truncsfhf2
+; SOFT-FLOAT-64-NEXT: mov.s $f12, $f0
+; SOFT-FLOAT-64-NEXT: lw $3, 4($sp)
+; SOFT-FLOAT-64-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: jr $ra
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 16
+ %result = call { half, i32 } @llvm.frexp.f16.i32(half %a)
+ ret { half, i32 } %result
+}
+
+define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) nounwind {
+; MIPSEL-LABEL: test_frexp_v2f16_v2i32:
+; MIPSEL: # %bb.0:
+; MIPSEL-NEXT: addiu $sp, $sp, -32
+; MIPSEL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: sw $18, 24($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: sw $17, 20($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: sw $16, 16($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: move $16, $5
+; MIPSEL-NEXT: move $17, $4
+; MIPSEL-NEXT: jal __extendhfsf2
+; MIPSEL-NEXT: srl $4, $5, 16
+; MIPSEL-NEXT: addiu $5, $17, 12
+; MIPSEL-NEXT: jal frexpf
+; MIPSEL-NEXT: mov.s $f12, $f0
+; MIPSEL-NEXT: jal __truncsfhf2
+; MIPSEL-NEXT: mov.s $f12, $f0
+; MIPSEL-NEXT: move $18, $2
+; MIPSEL-NEXT: jal __extendhfsf2
+; MIPSEL-NEXT: move $4, $16
+; MIPSEL-NEXT: addiu $5, $17, 8
+; MIPSEL-NEXT: jal frexpf
+; MIPSEL-NEXT: mov.s $f12, $f0
+; MIPSEL-NEXT: sh $18, 2($17)
+; MIPSEL-NEXT: jal __truncsfhf2
+; MIPSEL-NEXT: mov.s $f12, $f0
+; MIPSEL-NEXT: sh $2, 0($17)
+; MIPSEL-NEXT: lw $16, 16($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: lw $17, 20($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: lw $18, 24($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: jr $ra
+; MIPSEL-NEXT: addiu $sp, $sp, 32
+;
+; SOFT-FLOAT-32-LABEL: test_frexp_v2f16_v2i32:
+; SOFT-FLOAT-32: # %bb.0:
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -32
+; SOFT-FLOAT-32-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: sw $18, 24($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: sw $17, 20($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: sw $16, 16($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: move $16, $5
+; SOFT-FLOAT-32-NEXT: move $17, $4
+; SOFT-FLOAT-32-NEXT: jal __extendhfsf2
+; SOFT-FLOAT-32-NEXT: move $4, $5
+; SOFT-FLOAT-32-NEXT: addiu $5, $17, 12
+; SOFT-FLOAT-32-NEXT: jal frexpf
+; SOFT-FLOAT-32-NEXT: mov.s $f12, $f0
+; SOFT-FLOAT-32-NEXT: jal __truncsfhf2
+; SOFT-FLOAT-32-NEXT: mov.s $f12, $f0
+; SOFT-FLOAT-32-NEXT: move $18, $2
+; SOFT-FLOAT-32-NEXT: jal __extendhfsf2
+; SOFT-FLOAT-32-NEXT: srl $4, $16, 16
+; SOFT-FLOAT-32-NEXT: addiu $5, $17, 8
+; SOFT-FLOAT-32-NEXT: jal frexpf
+; SOFT-FLOAT-32-NEXT: mov.s $f12, $f0
+; SOFT-FLOAT-32-NEXT: sh $18, 2($17)
+; SOFT-FLOAT-32-NEXT: jal __truncsfhf2
+; SOFT-FLOAT-32-NEXT: mov.s $f12, $f0
+; SOFT-FLOAT-32-NEXT: sh $2, 0($17)
+; SOFT-FLOAT-32-NEXT: lw $16, 16($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: lw $17, 20($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: lw $18, 24($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: jr $ra
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 32
+;
+; SOFT-FLOAT-64-LABEL: test_frexp_v2f16_v2i32:
+; SOFT-FLOAT-64: # %bb.0:
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -32
+; SOFT-FLOAT-64-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: sd $18, 16($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: sd $17, 8($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: sd $16, 0($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: move $16, $5
+; SOFT-FLOAT-64-NEXT: move $17, $4
+; SOFT-FLOAT-64-NEXT: jal __extendhfsf2
+; SOFT-FLOAT-64-NEXT: sll $4, $5, 0
+; SOFT-FLOAT-64-NEXT: daddiu $5, $17, 12
+; SOFT-FLOAT-64-NEXT: jal frexpf
+; SOFT-FLOAT-64-NEXT: mov.s $f12, $f0
+; SOFT-FLOAT-64-NEXT: jal __truncsfhf2
+; SOFT-FLOAT-64-NEXT: mov.s $f12, $f0
+; SOFT-FLOAT-64-NEXT: move $18, $2
+; SOFT-FLOAT-64-NEXT: sll $1, $16, 0
+; SOFT-FLOAT-64-NEXT: jal __extendhfsf2
+; SOFT-FLOAT-64-NEXT: srl $4, $1, 16
+; SOFT-FLOAT-64-NEXT: daddiu $5, $17, 8
+; SOFT-FLOAT-64-NEXT: jal frexpf
+; SOFT-FLOAT-64-NEXT: mov.s $f12, $f0
+; SOFT-FLOAT-64-NEXT: sh $18, 2($17)
+; SOFT-FLOAT-64-NEXT: jal __truncsfhf2
+; SOFT-FLOAT-64-NEXT: mov.s $f12, $f0
+; SOFT-FLOAT-64-NEXT: sh $2, 0($17)
+; SOFT-FLOAT-64-NEXT: ld $16, 0($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: ld $17, 8($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: ld $18, 16($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: jr $ra
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 32
+ %result = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> %a)
+ ret { <2 x half>, <2 x i32> } %result
+}
+
+define { float, i32 } @test_frexp_f32_i32(float %a) nounwind {
+; MIPSEL-LABEL: test_frexp_f32_i32:
+; MIPSEL: # %bb.0:
+; MIPSEL-NEXT: addiu $sp, $sp, -24
+; MIPSEL-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: jal frexpf
+; MIPSEL-NEXT: addiu $5, $sp, 16
+; MIPSEL-NEXT: lw $2, 16($sp)
+; MIPSEL-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: jr $ra
+; MIPSEL-NEXT: addiu $sp, $sp, 24
+;
+; SOFT-FLOAT-32-LABEL: test_frexp_f32_i32:
+; SOFT-FLOAT-32: # %bb.0:
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -24
+; SOFT-FLOAT-32-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: jal frexpf
+; SOFT-FLOAT-32-NEXT: addiu $5, $sp, 16
+; SOFT-FLOAT-32-NEXT: lw $2, 16($sp)
+; SOFT-FLOAT-32-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: jr $ra
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 24
+;
+; SOFT-FLOAT-64-LABEL: test_frexp_f32_i32:
+; SOFT-FLOAT-64: # %bb.0:
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -16
+; SOFT-FLOAT-64-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: jal frexpf
+; SOFT-FLOAT-64-NEXT: daddiu $5, $sp, 4
+; SOFT-FLOAT-64-NEXT: lw $2, 4($sp)
+; SOFT-FLOAT-64-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: jr $ra
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 16
+ %result = call { float, i32 } @llvm.frexp.f32.i32(float %a)
+ ret { float, i32 } %result
+}
+
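+; The tail call is not lowered as a tail call: the exponent out-parameter must
+; point into the caller's frame, so codegen matches the non-tail-call case.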
+define { float, i32 } @test_frexp_f32_i32_tailcall(float %a) nounwind {
+; MIPSEL-LABEL: test_frexp_f32_i32_tailcall:
+; MIPSEL: # %bb.0:
+; MIPSEL-NEXT: addiu $sp, $sp, -24
+; MIPSEL-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: jal frexpf
+; MIPSEL-NEXT: addiu $5, $sp, 16
+; MIPSEL-NEXT: lw $2, 16($sp)
+; MIPSEL-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: jr $ra
+; MIPSEL-NEXT: addiu $sp, $sp, 24
+;
+; SOFT-FLOAT-32-LABEL: test_frexp_f32_i32_tailcall:
+; SOFT-FLOAT-32: # %bb.0:
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -24
+; SOFT-FLOAT-32-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: jal frexpf
+; SOFT-FLOAT-32-NEXT: addiu $5, $sp, 16
+; SOFT-FLOAT-32-NEXT: lw $2, 16($sp)
+; SOFT-FLOAT-32-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: jr $ra
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 24
+;
+; SOFT-FLOAT-64-LABEL: test_frexp_f32_i32_tailcall:
+; SOFT-FLOAT-64: # %bb.0:
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -16
+; SOFT-FLOAT-64-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: jal frexpf
+; SOFT-FLOAT-64-NEXT: daddiu $5, $sp, 4
+; SOFT-FLOAT-64-NEXT: lw $2, 4($sp)
+; SOFT-FLOAT-64-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: jr $ra
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 16
+ %result = tail call { float, i32 } @llvm.frexp.f32.i32(float %a)
+ ret { float, i32 } %result
+}
+
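+; On mips64, each pair of 32-bit results is repacked into a single 64-bit
+; register with dsll/dsrl shift pairs before the halves are OR'd together.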
+define { <2 x float>, <2 x i32> } @test_frexp_v2f32_v2i32(<2 x float> %a) nounwind {
+; MIPSEL-LABEL: test_frexp_v2f32_v2i32:
+; MIPSEL: # %bb.0:
+; MIPSEL-NEXT: addiu $sp, $sp, -32
+; MIPSEL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: sw $17, 24($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: sw $16, 20($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: move $16, $6
+; MIPSEL-NEXT: move $17, $4
+; MIPSEL-NEXT: mtc1 $7, $f12
+; MIPSEL-NEXT: jal frexpf
+; MIPSEL-NEXT: addiu $5, $4, 12
+; MIPSEL-NEXT: swc1 $f0, 4($17)
+; MIPSEL-NEXT: mtc1 $16, $f12
+; MIPSEL-NEXT: jal frexpf
+; MIPSEL-NEXT: addiu $5, $17, 8
+; MIPSEL-NEXT: swc1 $f0, 0($17)
+; MIPSEL-NEXT: lw $16, 20($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: lw $17, 24($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: jr $ra
+; MIPSEL-NEXT: addiu $sp, $sp, 32
+;
+; SOFT-FLOAT-32-LABEL: test_frexp_v2f32_v2i32:
+; SOFT-FLOAT-32: # %bb.0:
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -32
+; SOFT-FLOAT-32-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: sw $17, 24($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: sw $16, 20($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: move $16, $6
+; SOFT-FLOAT-32-NEXT: move $17, $4
+; SOFT-FLOAT-32-NEXT: mtc1 $7, $f12
+; SOFT-FLOAT-32-NEXT: jal frexpf
+; SOFT-FLOAT-32-NEXT: addiu $5, $4, 12
+; SOFT-FLOAT-32-NEXT: swc1 $f0, 4($17)
+; SOFT-FLOAT-32-NEXT: mtc1 $16, $f12
+; SOFT-FLOAT-32-NEXT: jal frexpf
+; SOFT-FLOAT-32-NEXT: addiu $5, $17, 8
+; SOFT-FLOAT-32-NEXT: swc1 $f0, 0($17)
+; SOFT-FLOAT-32-NEXT: lw $16, 20($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: lw $17, 24($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: jr $ra
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 32
+;
+; SOFT-FLOAT-64-LABEL: test_frexp_v2f32_v2i32:
+; SOFT-FLOAT-64: # %bb.0:
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -32
+; SOFT-FLOAT-64-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: sd $16, 16($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: move $16, $4
+; SOFT-FLOAT-64-NEXT: sll $1, $16, 0
+; SOFT-FLOAT-64-NEXT: mtc1 $1, $f12
+; SOFT-FLOAT-64-NEXT: jal frexpf
+; SOFT-FLOAT-64-NEXT: daddiu $5, $sp, 8
+; SOFT-FLOAT-64-NEXT: mfc1 $1, $f0
+; SOFT-FLOAT-64-NEXT: dsll $1, $1, 32
+; SOFT-FLOAT-64-NEXT: dsrl $2, $16, 32
+; SOFT-FLOAT-64-NEXT: daddiu $5, $sp, 12
+; SOFT-FLOAT-64-NEXT: dsrl $16, $1, 32
+; SOFT-FLOAT-64-NEXT: sll $1, $2, 0
+; SOFT-FLOAT-64-NEXT: jal frexpf
+; SOFT-FLOAT-64-NEXT: mtc1 $1, $f12
+; SOFT-FLOAT-64-NEXT: mfc1 $1, $f0
+; SOFT-FLOAT-64-NEXT: dsll $1, $1, 32
+; SOFT-FLOAT-64-NEXT: or $2, $16, $1
+; SOFT-FLOAT-64-NEXT: lw $1, 12($sp)
+; SOFT-FLOAT-64-NEXT: dsll $1, $1, 32
+; SOFT-FLOAT-64-NEXT: lw $3, 8($sp)
+; SOFT-FLOAT-64-NEXT: dsll $3, $3, 32
+; SOFT-FLOAT-64-NEXT: dsrl $3, $3, 32
+; SOFT-FLOAT-64-NEXT: or $3, $3, $1
+; SOFT-FLOAT-64-NEXT: ld $16, 16($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: jr $ra
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 32
+ %result = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> %a)
+ ret { <2 x float>, <2 x i32> } %result
+}
+
+define { double, i32 } @test_frexp_f64_i32(double %a) nounwind {
+; MIPSEL-LABEL: test_frexp_f64_i32:
+; MIPSEL: # %bb.0:
+; MIPSEL-NEXT: addiu $sp, $sp, -24
+; MIPSEL-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: jal frexp
+; MIPSEL-NEXT: addiu $6, $sp, 16
+; MIPSEL-NEXT: lw $2, 16($sp)
+; MIPSEL-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: jr $ra
+; MIPSEL-NEXT: addiu $sp, $sp, 24
+;
+; SOFT-FLOAT-32-LABEL: test_frexp_f64_i32:
+; SOFT-FLOAT-32: # %bb.0:
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -24
+; SOFT-FLOAT-32-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: jal frexp
+; SOFT-FLOAT-32-NEXT: addiu $6, $sp, 16
+; SOFT-FLOAT-32-NEXT: lw $2, 16($sp)
+; SOFT-FLOAT-32-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: jr $ra
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 24
+;
+; SOFT-FLOAT-64-LABEL: test_frexp_f64_i32:
+; SOFT-FLOAT-64: # %bb.0:
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -16
+; SOFT-FLOAT-64-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: jal frexp
+; SOFT-FLOAT-64-NEXT: daddiu $5, $sp, 4
+; SOFT-FLOAT-64-NEXT: lw $2, 4($sp)
+; SOFT-FLOAT-64-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: jr $ra
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 16
+ %result = call { double, i32 } @llvm.frexp.f64.i32(double %a)
+ ret { double, i32 } %result
+}
+
+define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) nounwind {
+; MIPSEL-LABEL: test_frexp_v2f64_v2i32:
+; MIPSEL: # %bb.0:
+; MIPSEL-NEXT: addiu $sp, $sp, -48
+; MIPSEL-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: sw $18, 40($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: sw $17, 36($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: sw $16, 32($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: move $16, $7
+; MIPSEL-NEXT: move $17, $6
+; MIPSEL-NEXT: move $18, $4
+; MIPSEL-NEXT: lw $1, 64($sp)
+; MIPSEL-NEXT: lw $2, 68($sp)
+; MIPSEL-NEXT: sw $2, 28($sp)
+; MIPSEL-NEXT: sw $1, 24($sp)
+; MIPSEL-NEXT: addiu $6, $4, 20
+; MIPSEL-NEXT: jal frexp
+; MIPSEL-NEXT: ldc1 $f12, 24($sp)
+; MIPSEL-NEXT: sdc1 $f0, 8($18)
+; MIPSEL-NEXT: sw $16, 20($sp)
+; MIPSEL-NEXT: sw $17, 16($sp)
+; MIPSEL-NEXT: addiu $6, $18, 16
+; MIPSEL-NEXT: jal frexp
+; MIPSEL-NEXT: ldc1 $f12, 16($sp)
+; MIPSEL-NEXT: sdc1 $f0, 0($18)
+; MIPSEL-NEXT: lw $16, 32($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: lw $17, 36($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: lw $18, 40($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: jr $ra
+; MIPSEL-NEXT: addiu $sp, $sp, 48
+;
+; SOFT-FLOAT-32-LABEL: test_frexp_v2f64_v2i32:
+; SOFT-FLOAT-32: # %bb.0:
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -48
+; SOFT-FLOAT-32-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: sw $18, 40($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: sw $17, 36($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: sw $16, 32($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: move $16, $7
+; SOFT-FLOAT-32-NEXT: move $17, $6
+; SOFT-FLOAT-32-NEXT: move $18, $4
+; SOFT-FLOAT-32-NEXT: lw $1, 64($sp)
+; SOFT-FLOAT-32-NEXT: lw $2, 68($sp)
+; SOFT-FLOAT-32-NEXT: sw $2, 28($sp)
+; SOFT-FLOAT-32-NEXT: sw $1, 24($sp)
+; SOFT-FLOAT-32-NEXT: addiu $6, $4, 20
+; SOFT-FLOAT-32-NEXT: jal frexp
+; SOFT-FLOAT-32-NEXT: ldc1 $f12, 24($sp)
+; SOFT-FLOAT-32-NEXT: sdc1 $f0, 8($18)
+; SOFT-FLOAT-32-NEXT: sw $16, 20($sp)
+; SOFT-FLOAT-32-NEXT: sw $17, 16($sp)
+; SOFT-FLOAT-32-NEXT: addiu $6, $18, 16
+; SOFT-FLOAT-32-NEXT: jal frexp
+; SOFT-FLOAT-32-NEXT: ldc1 $f12, 16($sp)
+; SOFT-FLOAT-32-NEXT: sdc1 $f0, 0($18)
+; SOFT-FLOAT-32-NEXT: lw $16, 32($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: lw $17, 36($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: lw $18, 40($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: jr $ra
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 48
+;
+; SOFT-FLOAT-64-LABEL: test_frexp_v2f64_v2i32:
+; SOFT-FLOAT-64: # %bb.0:
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -32
+; SOFT-FLOAT-64-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: sd $17, 16($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: sd $16, 8($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: move $16, $5
+; SOFT-FLOAT-64-NEXT: move $17, $4
+; SOFT-FLOAT-64-NEXT: dmtc1 $6, $f12
+; SOFT-FLOAT-64-NEXT: jal frexp
+; SOFT-FLOAT-64-NEXT: daddiu $5, $4, 20
+; SOFT-FLOAT-64-NEXT: sdc1 $f0, 8($17)
+; SOFT-FLOAT-64-NEXT: dmtc1 $16, $f12
+; SOFT-FLOAT-64-NEXT: jal frexp
+; SOFT-FLOAT-64-NEXT: daddiu $5, $17, 16
+; SOFT-FLOAT-64-NEXT: sdc1 $f0, 0($17)
+; SOFT-FLOAT-64-NEXT: ld $16, 8($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: ld $17, 16($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: jr $ra
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 32
+ %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a)
+ ret { <2 x double>, <2 x i32> } %result
+}
+
+define { fp128, i32 } @test_frexp_fp128_i32(fp128 %a) nounwind {
+; MIPSEL-LABEL: test_frexp_fp128_i32:
+; MIPSEL: # %bb.0:
+; MIPSEL-NEXT: addiu $sp, $sp, -40
+; MIPSEL-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: sw $16, 32($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: move $1, $7
+; MIPSEL-NEXT: move $16, $4
+; MIPSEL-NEXT: addiu $2, $sp, 28
+; MIPSEL-NEXT: sw $2, 16($sp)
+; MIPSEL-NEXT: lw $7, 56($sp)
+; MIPSEL-NEXT: move $4, $5
+; MIPSEL-NEXT: move $5, $6
+; MIPSEL-NEXT: jal frexpl
+; MIPSEL-NEXT: move $6, $1
+; MIPSEL-NEXT: sw $5, 12($16)
+; MIPSEL-NEXT: sw $4, 8($16)
+; MIPSEL-NEXT: sw $3, 4($16)
+; MIPSEL-NEXT: sw $2, 0($16)
+; MIPSEL-NEXT: lw $1, 28($sp)
+; MIPSEL-NEXT: sw $1, 16($16)
+; MIPSEL-NEXT: lw $16, 32($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: jr $ra
+; MIPSEL-NEXT: addiu $sp, $sp, 40
+;
+; SOFT-FLOAT-32-LABEL: test_frexp_fp128_i32:
+; SOFT-FLOAT-32: # %bb.0:
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -40
+; SOFT-FLOAT-32-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: sw $16, 32($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: move $1, $7
+; SOFT-FLOAT-32-NEXT: move $16, $4
+; SOFT-FLOAT-32-NEXT: addiu $2, $sp, 28
+; SOFT-FLOAT-32-NEXT: sw $2, 16($sp)
+; SOFT-FLOAT-32-NEXT: lw $7, 56($sp)
+; SOFT-FLOAT-32-NEXT: move $4, $5
+; SOFT-FLOAT-32-NEXT: move $5, $6
+; SOFT-FLOAT-32-NEXT: jal frexpl
+; SOFT-FLOAT-32-NEXT: move $6, $1
+; SOFT-FLOAT-32-NEXT: sw $5, 12($16)
+; SOFT-FLOAT-32-NEXT: sw $4, 8($16)
+; SOFT-FLOAT-32-NEXT: sw $3, 4($16)
+; SOFT-FLOAT-32-NEXT: sw $2, 0($16)
+; SOFT-FLOAT-32-NEXT: lw $1, 28($sp)
+; SOFT-FLOAT-32-NEXT: sw $1, 16($16)
+; SOFT-FLOAT-32-NEXT: lw $16, 32($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: jr $ra
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 40
+;
+; SOFT-FLOAT-64-LABEL: test_frexp_fp128_i32:
+; SOFT-FLOAT-64: # %bb.0:
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -16
+; SOFT-FLOAT-64-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: dmfc1 $4, $f12
+; SOFT-FLOAT-64-NEXT: dmfc1 $5, $f13
+; SOFT-FLOAT-64-NEXT: jal frexpl
+; SOFT-FLOAT-64-NEXT: daddiu $6, $sp, 4
+; SOFT-FLOAT-64-NEXT: lw $4, 4($sp)
+; SOFT-FLOAT-64-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: jr $ra
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 16
+ %result = call { fp128, i32 } @llvm.frexp.fp128.i32(fp128 %a)
+ ret { fp128, i32 } %result
+}
+
+define { <2 x fp128>, <2 x i32> } @test_frexp_v2fp128_v2i32(<2 x fp128> %a) nounwind {
+; MIPSEL-LABEL: test_frexp_v2fp128_v2i32:
+; MIPSEL: # %bb.0:
+; MIPSEL-NEXT: addiu $sp, $sp, -48
+; MIPSEL-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: sw $18, 40($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: sw $17, 36($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: sw $16, 32($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: move $16, $7
+; MIPSEL-NEXT: move $17, $6
+; MIPSEL-NEXT: move $18, $4
+; MIPSEL-NEXT: addiu $1, $sp, 28
+; MIPSEL-NEXT: sw $1, 16($sp)
+; MIPSEL-NEXT: lw $4, 72($sp)
+; MIPSEL-NEXT: lw $5, 76($sp)
+; MIPSEL-NEXT: lw $6, 80($sp)
+; MIPSEL-NEXT: lw $7, 84($sp)
+; MIPSEL-NEXT: jal frexpl
+; MIPSEL-NEXT: nop
+; MIPSEL-NEXT: addiu $1, $sp, 24
+; MIPSEL-NEXT: sw $1, 16($sp)
+; MIPSEL-NEXT: lw $7, 68($sp)
+; MIPSEL-NEXT: lw $6, 64($sp)
+; MIPSEL-NEXT: sw $5, 28($18)
+; MIPSEL-NEXT: sw $4, 24($18)
+; MIPSEL-NEXT: sw $3, 20($18)
+; MIPSEL-NEXT: sw $2, 16($18)
+; MIPSEL-NEXT: move $4, $17
+; MIPSEL-NEXT: jal frexpl
+; MIPSEL-NEXT: move $5, $16
+; MIPSEL-NEXT: sw $5, 12($18)
+; MIPSEL-NEXT: sw $4, 8($18)
+; MIPSEL-NEXT: sw $3, 4($18)
+; MIPSEL-NEXT: sw $2, 0($18)
+; MIPSEL-NEXT: lw $1, 28($sp)
+; MIPSEL-NEXT: sw $1, 36($18)
+; MIPSEL-NEXT: lw $1, 24($sp)
+; MIPSEL-NEXT: sw $1, 32($18)
+; MIPSEL-NEXT: lw $16, 32($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: lw $17, 36($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: lw $18, 40($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: jr $ra
+; MIPSEL-NEXT: addiu $sp, $sp, 48
+;
+; SOFT-FLOAT-32-LABEL: test_frexp_v2fp128_v2i32:
+; SOFT-FLOAT-32: # %bb.0:
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -48
+; SOFT-FLOAT-32-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: sw $18, 40($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: sw $17, 36($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: sw $16, 32($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: move $16, $7
+; SOFT-FLOAT-32-NEXT: move $17, $6
+; SOFT-FLOAT-32-NEXT: move $18, $4
+; SOFT-FLOAT-32-NEXT: addiu $1, $sp, 28
+; SOFT-FLOAT-32-NEXT: sw $1, 16($sp)
+; SOFT-FLOAT-32-NEXT: lw $4, 72($sp)
+; SOFT-FLOAT-32-NEXT: lw $5, 76($sp)
+; SOFT-FLOAT-32-NEXT: lw $6, 80($sp)
+; SOFT-FLOAT-32-NEXT: lw $7, 84($sp)
+; SOFT-FLOAT-32-NEXT: jal frexpl
+; SOFT-FLOAT-32-NEXT: nop
+; SOFT-FLOAT-32-NEXT: addiu $1, $sp, 24
+; SOFT-FLOAT-32-NEXT: sw $1, 16($sp)
+; SOFT-FLOAT-32-NEXT: lw $7, 68($sp)
+; SOFT-FLOAT-32-NEXT: lw $6, 64($sp)
+; SOFT-FLOAT-32-NEXT: sw $5, 28($18)
+; SOFT-FLOAT-32-NEXT: sw $4, 24($18)
+; SOFT-FLOAT-32-NEXT: sw $3, 20($18)
+; SOFT-FLOAT-32-NEXT: sw $2, 16($18)
+; SOFT-FLOAT-32-NEXT: move $4, $17
+; SOFT-FLOAT-32-NEXT: jal frexpl
+; SOFT-FLOAT-32-NEXT: move $5, $16
+; SOFT-FLOAT-32-NEXT: sw $5, 12($18)
+; SOFT-FLOAT-32-NEXT: sw $4, 8($18)
+; SOFT-FLOAT-32-NEXT: sw $3, 4($18)
+; SOFT-FLOAT-32-NEXT: sw $2, 0($18)
+; SOFT-FLOAT-32-NEXT: lw $1, 28($sp)
+; SOFT-FLOAT-32-NEXT: sw $1, 36($18)
+; SOFT-FLOAT-32-NEXT: lw $1, 24($sp)
+; SOFT-FLOAT-32-NEXT: sw $1, 32($18)
+; SOFT-FLOAT-32-NEXT: lw $16, 32($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: lw $17, 36($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: lw $18, 40($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: jr $ra
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 48
+;
+; SOFT-FLOAT-64-LABEL: test_frexp_v2fp128_v2i32:
+; SOFT-FLOAT-64: # %bb.0:
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -64
+; SOFT-FLOAT-64-NEXT: sd $ra, 56($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: sd $20, 48($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: sd $19, 40($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: sd $18, 32($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: sd $17, 24($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: sd $16, 16($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: move $16, $6
+; SOFT-FLOAT-64-NEXT: move $17, $5
+; SOFT-FLOAT-64-NEXT: move $18, $4
+; SOFT-FLOAT-64-NEXT: daddiu $6, $sp, 12
+; SOFT-FLOAT-64-NEXT: move $4, $7
+; SOFT-FLOAT-64-NEXT: jal frexpl
+; SOFT-FLOAT-64-NEXT: move $5, $8
+; SOFT-FLOAT-64-NEXT: move $19, $2
+; SOFT-FLOAT-64-NEXT: move $20, $3
+; SOFT-FLOAT-64-NEXT: daddiu $6, $sp, 8
+; SOFT-FLOAT-64-NEXT: lw $1, 12($sp)
+; SOFT-FLOAT-64-NEXT: sw $1, 36($18)
+; SOFT-FLOAT-64-NEXT: move $4, $17
+; SOFT-FLOAT-64-NEXT: jal frexpl
+; SOFT-FLOAT-64-NEXT: move $5, $16
+; SOFT-FLOAT-64-NEXT: lw $1, 8($sp)
+; SOFT-FLOAT-64-NEXT: sw $1, 32($18)
+; SOFT-FLOAT-64-NEXT: sd $20, 24($18)
+; SOFT-FLOAT-64-NEXT: sd $19, 16($18)
+; SOFT-FLOAT-64-NEXT: sd $3, 8($18)
+; SOFT-FLOAT-64-NEXT: sd $2, 0($18)
+; SOFT-FLOAT-64-NEXT: ld $16, 16($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: ld $17, 24($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: ld $18, 32($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: ld $19, 40($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: ld $20, 48($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: ld $ra, 56($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: jr $ra
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 64
+ %result = call { <2 x fp128>, <2 x i32> } @llvm.frexp.v2fp128.v2i32(<2 x fp128> %a)
+ ret { <2 x fp128>, <2 x i32> } %result
+}
+
+declare { half, i32 } @llvm.frexp.f16.i32(half) #0
+declare { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half>) #0
+
+declare { float, i32 } @llvm.frexp.f32.i32(float) #0
+declare { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float>) #0
+
+declare { double, i32 } @llvm.frexp.f64.i32(double) #0
+declare { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double>) #0
+
+declare { fp128, i32 } @llvm.frexp.fp128.i32(fp128) #0
+declare { <2 x fp128>, <2 x i32> } @llvm.frexp.v2fp128.v2i32(<2 x fp128>) #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/CodeGen/Mips/llvm.sincos.ll b/llvm/test/CodeGen/Mips/llvm.sincos.ll
new file mode 100644
index 0000000..046be12
--- /dev/null
+++ b/llvm/test/CodeGen/Mips/llvm.sincos.ll
@@ -0,0 +1,1044 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=mipsel < %s | FileCheck -check-prefix=MIPSEL %s
+; RUN: llc -mtriple=mips < %s | FileCheck %s -check-prefixes=SOFT-FLOAT-32
+; RUN: llc -mtriple=mips64 < %s | FileCheck %s -check-prefixes=SOFT-FLOAT-64
+
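+; llvm.sincos.* lowers to the sincosf/sincos/sincosl libcalls, which write both
+; results through pointer arguments; the results are reloaded from stack slots,
+; or stored straight into the sret buffer for the vector and fp128 cases.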
+define { half, half } @test_sincos_f16(half %a) #0 {
+; MIPSEL-LABEL: test_sincos_f16:
+; MIPSEL: # %bb.0:
+; MIPSEL-NEXT: addiu $sp, $sp, -32
+; MIPSEL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: sw $16, 24($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: jal __extendhfsf2
+; MIPSEL-NEXT: nop
+; MIPSEL-NEXT: addiu $5, $sp, 20
+; MIPSEL-NEXT: addiu $6, $sp, 16
+; MIPSEL-NEXT: jal sincosf
+; MIPSEL-NEXT: mov.s $f12, $f0
+; MIPSEL-NEXT: jal __truncsfhf2
+; MIPSEL-NEXT: lwc1 $f12, 20($sp)
+; MIPSEL-NEXT: move $16, $2
+; MIPSEL-NEXT: jal __truncsfhf2
+; MIPSEL-NEXT: lwc1 $f12, 16($sp)
+; MIPSEL-NEXT: move $3, $2
+; MIPSEL-NEXT: move $2, $16
+; MIPSEL-NEXT: lw $16, 24($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: jr $ra
+; MIPSEL-NEXT: addiu $sp, $sp, 32
+;
+; SOFT-FLOAT-32-LABEL: test_sincos_f16:
+; SOFT-FLOAT-32: # %bb.0:
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -32
+; SOFT-FLOAT-32-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: sw $16, 24($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: jal __extendhfsf2
+; SOFT-FLOAT-32-NEXT: nop
+; SOFT-FLOAT-32-NEXT: addiu $5, $sp, 20
+; SOFT-FLOAT-32-NEXT: addiu $6, $sp, 16
+; SOFT-FLOAT-32-NEXT: jal sincosf
+; SOFT-FLOAT-32-NEXT: mov.s $f12, $f0
+; SOFT-FLOAT-32-NEXT: jal __truncsfhf2
+; SOFT-FLOAT-32-NEXT: lwc1 $f12, 20($sp)
+; SOFT-FLOAT-32-NEXT: move $16, $2
+; SOFT-FLOAT-32-NEXT: jal __truncsfhf2
+; SOFT-FLOAT-32-NEXT: lwc1 $f12, 16($sp)
+; SOFT-FLOAT-32-NEXT: move $3, $2
+; SOFT-FLOAT-32-NEXT: move $2, $16
+; SOFT-FLOAT-32-NEXT: lw $16, 24($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: jr $ra
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 32
+;
+; SOFT-FLOAT-64-LABEL: test_sincos_f16:
+; SOFT-FLOAT-64: # %bb.0:
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -32
+; SOFT-FLOAT-64-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: sd $16, 16($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: jal __extendhfsf2
+; SOFT-FLOAT-64-NEXT: sll $4, $4, 0
+; SOFT-FLOAT-64-NEXT: daddiu $5, $sp, 12
+; SOFT-FLOAT-64-NEXT: daddiu $6, $sp, 8
+; SOFT-FLOAT-64-NEXT: jal sincosf
+; SOFT-FLOAT-64-NEXT: mov.s $f12, $f0
+; SOFT-FLOAT-64-NEXT: jal __truncsfhf2
+; SOFT-FLOAT-64-NEXT: lwc1 $f12, 12($sp)
+; SOFT-FLOAT-64-NEXT: move $16, $2
+; SOFT-FLOAT-64-NEXT: jal __truncsfhf2
+; SOFT-FLOAT-64-NEXT: lwc1 $f12, 8($sp)
+; SOFT-FLOAT-64-NEXT: move $3, $2
+; SOFT-FLOAT-64-NEXT: move $2, $16
+; SOFT-FLOAT-64-NEXT: ld $16, 16($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: jr $ra
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 32
+ %result = call { half, half } @llvm.sincos.f16(half %a)
+ ret { half, half } %result
+}
+
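+; Even when only one of the two results is used, the combined sincosf call is
+; still emitted; the unused result is simply left in its stack slot.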
+define half @test_sincos_f16_only_use_sin(half %a) #0 {
+; MIPSEL-LABEL: test_sincos_f16_only_use_sin:
+; MIPSEL: # %bb.0:
+; MIPSEL-NEXT: addiu $sp, $sp, -32
+; MIPSEL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: jal __extendhfsf2
+; MIPSEL-NEXT: nop
+; MIPSEL-NEXT: addiu $5, $sp, 24
+; MIPSEL-NEXT: addiu $6, $sp, 20
+; MIPSEL-NEXT: jal sincosf
+; MIPSEL-NEXT: mov.s $f12, $f0
+; MIPSEL-NEXT: jal __truncsfhf2
+; MIPSEL-NEXT: lwc1 $f12, 24($sp)
+; MIPSEL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: jr $ra
+; MIPSEL-NEXT: addiu $sp, $sp, 32
+;
+; SOFT-FLOAT-32-LABEL: test_sincos_f16_only_use_sin:
+; SOFT-FLOAT-32: # %bb.0:
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -32
+; SOFT-FLOAT-32-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: jal __extendhfsf2
+; SOFT-FLOAT-32-NEXT: nop
+; SOFT-FLOAT-32-NEXT: addiu $5, $sp, 24
+; SOFT-FLOAT-32-NEXT: addiu $6, $sp, 20
+; SOFT-FLOAT-32-NEXT: jal sincosf
+; SOFT-FLOAT-32-NEXT: mov.s $f12, $f0
+; SOFT-FLOAT-32-NEXT: jal __truncsfhf2
+; SOFT-FLOAT-32-NEXT: lwc1 $f12, 24($sp)
+; SOFT-FLOAT-32-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: jr $ra
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 32
+;
+; SOFT-FLOAT-64-LABEL: test_sincos_f16_only_use_sin:
+; SOFT-FLOAT-64: # %bb.0:
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -16
+; SOFT-FLOAT-64-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: jal __extendhfsf2
+; SOFT-FLOAT-64-NEXT: sll $4, $4, 0
+; SOFT-FLOAT-64-NEXT: daddiu $5, $sp, 4
+; SOFT-FLOAT-64-NEXT: daddiu $6, $sp, 0
+; SOFT-FLOAT-64-NEXT: jal sincosf
+; SOFT-FLOAT-64-NEXT: mov.s $f12, $f0
+; SOFT-FLOAT-64-NEXT: jal __truncsfhf2
+; SOFT-FLOAT-64-NEXT: lwc1 $f12, 4($sp)
+; SOFT-FLOAT-64-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: jr $ra
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 16
+ %result = call { half, half } @llvm.sincos.f16(half %a)
+ %result.0 = extractvalue { half, half } %result, 0
+ ret half %result.0
+}
+
+define half @test_sincos_f16_only_use_cos(half %a) #0 {
+; MIPSEL-LABEL: test_sincos_f16_only_use_cos:
+; MIPSEL: # %bb.0:
+; MIPSEL-NEXT: addiu $sp, $sp, -32
+; MIPSEL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: jal __extendhfsf2
+; MIPSEL-NEXT: nop
+; MIPSEL-NEXT: addiu $5, $sp, 24
+; MIPSEL-NEXT: addiu $6, $sp, 20
+; MIPSEL-NEXT: jal sincosf
+; MIPSEL-NEXT: mov.s $f12, $f0
+; MIPSEL-NEXT: jal __truncsfhf2
+; MIPSEL-NEXT: lwc1 $f12, 20($sp)
+; MIPSEL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: jr $ra
+; MIPSEL-NEXT: addiu $sp, $sp, 32
+;
+; SOFT-FLOAT-32-LABEL: test_sincos_f16_only_use_cos:
+; SOFT-FLOAT-32: # %bb.0:
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -32
+; SOFT-FLOAT-32-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: jal __extendhfsf2
+; SOFT-FLOAT-32-NEXT: nop
+; SOFT-FLOAT-32-NEXT: addiu $5, $sp, 24
+; SOFT-FLOAT-32-NEXT: addiu $6, $sp, 20
+; SOFT-FLOAT-32-NEXT: jal sincosf
+; SOFT-FLOAT-32-NEXT: mov.s $f12, $f0
+; SOFT-FLOAT-32-NEXT: jal __truncsfhf2
+; SOFT-FLOAT-32-NEXT: lwc1 $f12, 20($sp)
+; SOFT-FLOAT-32-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: jr $ra
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 32
+;
+; SOFT-FLOAT-64-LABEL: test_sincos_f16_only_use_cos:
+; SOFT-FLOAT-64: # %bb.0:
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -16
+; SOFT-FLOAT-64-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: jal __extendhfsf2
+; SOFT-FLOAT-64-NEXT: sll $4, $4, 0
+; SOFT-FLOAT-64-NEXT: daddiu $5, $sp, 4
+; SOFT-FLOAT-64-NEXT: daddiu $6, $sp, 0
+; SOFT-FLOAT-64-NEXT: jal sincosf
+; SOFT-FLOAT-64-NEXT: mov.s $f12, $f0
+; SOFT-FLOAT-64-NEXT: jal __truncsfhf2
+; SOFT-FLOAT-64-NEXT: lwc1 $f12, 0($sp)
+; SOFT-FLOAT-64-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: jr $ra
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 16
+ %result = call { half, half } @llvm.sincos.f16(half %a)
+ %result.1 = extractvalue { half, half } %result, 1
+ ret half %result.1
+}
+
+define { <2 x half>, <2 x half> } @test_sincos_v2f16(<2 x half> %a) #0 {
+; MIPSEL-LABEL: test_sincos_v2f16:
+; MIPSEL: # %bb.0:
+; MIPSEL-NEXT: addiu $sp, $sp, -48
+; MIPSEL-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: sw $17, 40($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: sw $16, 36($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: move $16, $5
+; MIPSEL-NEXT: move $17, $4
+; MIPSEL-NEXT: jal __extendhfsf2
+; MIPSEL-NEXT: move $4, $5
+; MIPSEL-NEXT: addiu $5, $sp, 24
+; MIPSEL-NEXT: addiu $6, $sp, 20
+; MIPSEL-NEXT: jal sincosf
+; MIPSEL-NEXT: mov.s $f12, $f0
+; MIPSEL-NEXT: jal __extendhfsf2
+; MIPSEL-NEXT: srl $4, $16, 16
+; MIPSEL-NEXT: addiu $5, $sp, 32
+; MIPSEL-NEXT: addiu $6, $sp, 28
+; MIPSEL-NEXT: jal sincosf
+; MIPSEL-NEXT: mov.s $f12, $f0
+; MIPSEL-NEXT: jal __truncsfhf2
+; MIPSEL-NEXT: lwc1 $f12, 20($sp)
+; MIPSEL-NEXT: lwc1 $f12, 24($sp)
+; MIPSEL-NEXT: jal __truncsfhf2
+; MIPSEL-NEXT: sh $2, 4($17)
+; MIPSEL-NEXT: sh $2, 0($17)
+; MIPSEL-NEXT: jal __truncsfhf2
+; MIPSEL-NEXT: lwc1 $f12, 28($sp)
+; MIPSEL-NEXT: sh $2, 6($17)
+; MIPSEL-NEXT: jal __truncsfhf2
+; MIPSEL-NEXT: lwc1 $f12, 32($sp)
+; MIPSEL-NEXT: sh $2, 2($17)
+; MIPSEL-NEXT: lw $16, 36($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: lw $17, 40($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: jr $ra
+; MIPSEL-NEXT: addiu $sp, $sp, 48
+;
+; SOFT-FLOAT-32-LABEL: test_sincos_v2f16:
+; SOFT-FLOAT-32: # %bb.0:
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -48
+; SOFT-FLOAT-32-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: sw $17, 40($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: sw $16, 36($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: move $16, $5
+; SOFT-FLOAT-32-NEXT: move $17, $4
+; SOFT-FLOAT-32-NEXT: jal __extendhfsf2
+; SOFT-FLOAT-32-NEXT: move $4, $5
+; SOFT-FLOAT-32-NEXT: addiu $5, $sp, 24
+; SOFT-FLOAT-32-NEXT: addiu $6, $sp, 20
+; SOFT-FLOAT-32-NEXT: jal sincosf
+; SOFT-FLOAT-32-NEXT: mov.s $f12, $f0
+; SOFT-FLOAT-32-NEXT: jal __extendhfsf2
+; SOFT-FLOAT-32-NEXT: srl $4, $16, 16
+; SOFT-FLOAT-32-NEXT: addiu $5, $sp, 32
+; SOFT-FLOAT-32-NEXT: addiu $6, $sp, 28
+; SOFT-FLOAT-32-NEXT: jal sincosf
+; SOFT-FLOAT-32-NEXT: mov.s $f12, $f0
+; SOFT-FLOAT-32-NEXT: jal __truncsfhf2
+; SOFT-FLOAT-32-NEXT: lwc1 $f12, 20($sp)
+; SOFT-FLOAT-32-NEXT: lwc1 $f12, 24($sp)
+; SOFT-FLOAT-32-NEXT: jal __truncsfhf2
+; SOFT-FLOAT-32-NEXT: sh $2, 6($17)
+; SOFT-FLOAT-32-NEXT: sh $2, 2($17)
+; SOFT-FLOAT-32-NEXT: jal __truncsfhf2
+; SOFT-FLOAT-32-NEXT: lwc1 $f12, 28($sp)
+; SOFT-FLOAT-32-NEXT: sh $2, 4($17)
+; SOFT-FLOAT-32-NEXT: jal __truncsfhf2
+; SOFT-FLOAT-32-NEXT: lwc1 $f12, 32($sp)
+; SOFT-FLOAT-32-NEXT: sh $2, 0($17)
+; SOFT-FLOAT-32-NEXT: lw $16, 36($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: lw $17, 40($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: jr $ra
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 48
+;
+; SOFT-FLOAT-64-LABEL: test_sincos_v2f16:
+; SOFT-FLOAT-64: # %bb.0:
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -48
+; SOFT-FLOAT-64-NEXT: sd $ra, 40($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: sd $17, 32($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: sd $16, 24($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: move $17, $5
+; SOFT-FLOAT-64-NEXT: move $16, $4
+; SOFT-FLOAT-64-NEXT: jal __extendhfsf2
+; SOFT-FLOAT-64-NEXT: sll $4, $5, 0
+; SOFT-FLOAT-64-NEXT: daddiu $5, $sp, 12
+; SOFT-FLOAT-64-NEXT: daddiu $6, $sp, 8
+; SOFT-FLOAT-64-NEXT: jal sincosf
+; SOFT-FLOAT-64-NEXT: mov.s $f12, $f0
+; SOFT-FLOAT-64-NEXT: sll $1, $17, 0
+; SOFT-FLOAT-64-NEXT: jal __extendhfsf2
+; SOFT-FLOAT-64-NEXT: srl $4, $1, 16
+; SOFT-FLOAT-64-NEXT: daddiu $5, $sp, 20
+; SOFT-FLOAT-64-NEXT: daddiu $6, $sp, 16
+; SOFT-FLOAT-64-NEXT: jal sincosf
+; SOFT-FLOAT-64-NEXT: mov.s $f12, $f0
+; SOFT-FLOAT-64-NEXT: jal __truncsfhf2
+; SOFT-FLOAT-64-NEXT: lwc1 $f12, 8($sp)
+; SOFT-FLOAT-64-NEXT: lwc1 $f12, 12($sp)
+; SOFT-FLOAT-64-NEXT: jal __truncsfhf2
+; SOFT-FLOAT-64-NEXT: sh $2, 6($16)
+; SOFT-FLOAT-64-NEXT: sh $2, 2($16)
+; SOFT-FLOAT-64-NEXT: jal __truncsfhf2
+; SOFT-FLOAT-64-NEXT: lwc1 $f12, 16($sp)
+; SOFT-FLOAT-64-NEXT: sh $2, 4($16)
+; SOFT-FLOAT-64-NEXT: jal __truncsfhf2
+; SOFT-FLOAT-64-NEXT: lwc1 $f12, 20($sp)
+; SOFT-FLOAT-64-NEXT: sh $2, 0($16)
+; SOFT-FLOAT-64-NEXT: ld $16, 24($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: ld $17, 32($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: ld $ra, 40($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: jr $ra
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 48
+ %result = call { <2 x half>, <2 x half> } @llvm.sincos.v2f16(<2 x half> %a)
+ ret { <2 x half>, <2 x half> } %result
+}
+
+define { float, float } @test_sincos_f32(float %a) #0 {
+; MIPSEL-LABEL: test_sincos_f32:
+; MIPSEL: # %bb.0:
+; MIPSEL-NEXT: addiu $sp, $sp, -32
+; MIPSEL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: addiu $5, $sp, 24
+; MIPSEL-NEXT: jal sincosf
+; MIPSEL-NEXT: addiu $6, $sp, 20
+; MIPSEL-NEXT: lwc1 $f0, 24($sp)
+; MIPSEL-NEXT: lwc1 $f2, 20($sp)
+; MIPSEL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: jr $ra
+; MIPSEL-NEXT: addiu $sp, $sp, 32
+;
+; SOFT-FLOAT-32-LABEL: test_sincos_f32:
+; SOFT-FLOAT-32: # %bb.0:
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -32
+; SOFT-FLOAT-32-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: addiu $5, $sp, 24
+; SOFT-FLOAT-32-NEXT: jal sincosf
+; SOFT-FLOAT-32-NEXT: addiu $6, $sp, 20
+; SOFT-FLOAT-32-NEXT: lwc1 $f0, 24($sp)
+; SOFT-FLOAT-32-NEXT: lwc1 $f2, 20($sp)
+; SOFT-FLOAT-32-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: jr $ra
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 32
+;
+; SOFT-FLOAT-64-LABEL: test_sincos_f32:
+; SOFT-FLOAT-64: # %bb.0:
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -16
+; SOFT-FLOAT-64-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: daddiu $5, $sp, 4
+; SOFT-FLOAT-64-NEXT: jal sincosf
+; SOFT-FLOAT-64-NEXT: daddiu $6, $sp, 0
+; SOFT-FLOAT-64-NEXT: lwc1 $f0, 4($sp)
+; SOFT-FLOAT-64-NEXT: lwc1 $f2, 0($sp)
+; SOFT-FLOAT-64-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: jr $ra
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 16
+ %result = call { float, float } @llvm.sincos.f32(float %a)
+ ret { float, float } %result
+}
+
+define { <2 x float>, <2 x float> } @test_sincos_v2f32(<2 x float> %a) #0 {
+; MIPSEL-LABEL: test_sincos_v2f32:
+; MIPSEL: # %bb.0:
+; MIPSEL-NEXT: addiu $sp, $sp, -32
+; MIPSEL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: sw $17, 24($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: sw $16, 20($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: move $16, $6
+; MIPSEL-NEXT: move $17, $4
+; MIPSEL-NEXT: mtc1 $7, $f12
+; MIPSEL-NEXT: addiu $5, $4, 4
+; MIPSEL-NEXT: jal sincosf
+; MIPSEL-NEXT: addiu $6, $4, 12
+; MIPSEL-NEXT: mtc1 $16, $f12
+; MIPSEL-NEXT: addiu $6, $17, 8
+; MIPSEL-NEXT: jal sincosf
+; MIPSEL-NEXT: move $5, $17
+; MIPSEL-NEXT: lw $16, 20($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: lw $17, 24($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: jr $ra
+; MIPSEL-NEXT: addiu $sp, $sp, 32
+;
+; SOFT-FLOAT-32-LABEL: test_sincos_v2f32:
+; SOFT-FLOAT-32: # %bb.0:
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -32
+; SOFT-FLOAT-32-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: sw $17, 24($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: sw $16, 20($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: move $16, $6
+; SOFT-FLOAT-32-NEXT: move $17, $4
+; SOFT-FLOAT-32-NEXT: mtc1 $7, $f12
+; SOFT-FLOAT-32-NEXT: addiu $5, $4, 4
+; SOFT-FLOAT-32-NEXT: jal sincosf
+; SOFT-FLOAT-32-NEXT: addiu $6, $4, 12
+; SOFT-FLOAT-32-NEXT: mtc1 $16, $f12
+; SOFT-FLOAT-32-NEXT: addiu $6, $17, 8
+; SOFT-FLOAT-32-NEXT: jal sincosf
+; SOFT-FLOAT-32-NEXT: move $5, $17
+; SOFT-FLOAT-32-NEXT: lw $16, 20($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: lw $17, 24($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: jr $ra
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 32
+;
+; SOFT-FLOAT-64-LABEL: test_sincos_v2f32:
+; SOFT-FLOAT-64: # %bb.0:
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -32
+; SOFT-FLOAT-64-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: sd $16, 16($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: move $16, $4
+; SOFT-FLOAT-64-NEXT: dsrl $1, $4, 32
+; SOFT-FLOAT-64-NEXT: sll $1, $1, 0
+; SOFT-FLOAT-64-NEXT: mtc1 $1, $f12
+; SOFT-FLOAT-64-NEXT: daddiu $5, $sp, 12
+; SOFT-FLOAT-64-NEXT: jal sincosf
+; SOFT-FLOAT-64-NEXT: daddiu $6, $sp, 8
+; SOFT-FLOAT-64-NEXT: sll $1, $16, 0
+; SOFT-FLOAT-64-NEXT: mtc1 $1, $f12
+; SOFT-FLOAT-64-NEXT: daddiu $5, $sp, 4
+; SOFT-FLOAT-64-NEXT: jal sincosf
+; SOFT-FLOAT-64-NEXT: daddiu $6, $sp, 0
+; SOFT-FLOAT-64-NEXT: lwc1 $f0, 12($sp)
+; SOFT-FLOAT-64-NEXT: mfc1 $1, $f0
+; SOFT-FLOAT-64-NEXT: dsll $1, $1, 32
+; SOFT-FLOAT-64-NEXT: lwc1 $f0, 8($sp)
+; SOFT-FLOAT-64-NEXT: mfc1 $3, $f0
+; SOFT-FLOAT-64-NEXT: lwc1 $f0, 4($sp)
+; SOFT-FLOAT-64-NEXT: mfc1 $2, $f0
+; SOFT-FLOAT-64-NEXT: dsll $2, $2, 32
+; SOFT-FLOAT-64-NEXT: dsrl $2, $2, 32
+; SOFT-FLOAT-64-NEXT: or $2, $2, $1
+; SOFT-FLOAT-64-NEXT: dsll $1, $3, 32
+; SOFT-FLOAT-64-NEXT: lwc1 $f0, 0($sp)
+; SOFT-FLOAT-64-NEXT: mfc1 $3, $f0
+; SOFT-FLOAT-64-NEXT: dsll $3, $3, 32
+; SOFT-FLOAT-64-NEXT: dsrl $3, $3, 32
+; SOFT-FLOAT-64-NEXT: or $3, $3, $1
+; SOFT-FLOAT-64-NEXT: ld $16, 16($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: jr $ra
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 32
+;
+; SOFT-FLOAT-64R2-LABEL: test_sincos_v2f32:
+; SOFT-FLOAT-64R2: # %bb.0:
+; SOFT-FLOAT-64R2-NEXT: daddiu $sp, $sp, -32
+; SOFT-FLOAT-64R2-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64R2-NEXT: sd $16, 16($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64R2-NEXT: move $16, $4
+; SOFT-FLOAT-64R2-NEXT: dsrl $1, $4, 32
+; SOFT-FLOAT-64R2-NEXT: sll $1, $1, 0
+; SOFT-FLOAT-64R2-NEXT: mtc1 $1, $f12
+; SOFT-FLOAT-64R2-NEXT: daddiu $5, $sp, 12
+; SOFT-FLOAT-64R2-NEXT: jal sincosf
+; SOFT-FLOAT-64R2-NEXT: daddiu $6, $sp, 8
+; SOFT-FLOAT-64R2-NEXT: sll $1, $16, 0
+; SOFT-FLOAT-64R2-NEXT: mtc1 $1, $f12
+; SOFT-FLOAT-64R2-NEXT: daddiu $5, $sp, 4
+; SOFT-FLOAT-64R2-NEXT: jal sincosf
+; SOFT-FLOAT-64R2-NEXT: daddiu $6, $sp, 0
+; SOFT-FLOAT-64R2-NEXT: lwc1 $f0, 12($sp)
+; SOFT-FLOAT-64R2-NEXT: mfc1 $1, $f0
+; SOFT-FLOAT-64R2-NEXT: dsll $1, $1, 32
+; SOFT-FLOAT-64R2-NEXT: lwc1 $f0, 4($sp)
+; SOFT-FLOAT-64R2-NEXT: mfc1 $2, $f0
+; SOFT-FLOAT-64R2-NEXT: dext $2, $2, 0, 32
+; SOFT-FLOAT-64R2-NEXT: lwc1 $f0, 8($sp)
+; SOFT-FLOAT-64R2-NEXT: or $2, $2, $1
+; SOFT-FLOAT-64R2-NEXT: mfc1 $1, $f0
+; SOFT-FLOAT-64R2-NEXT: dsll $1, $1, 32
+; SOFT-FLOAT-64R2-NEXT: lwc1 $f0, 0($sp)
+; SOFT-FLOAT-64R2-NEXT: mfc1 $3, $f0
+; SOFT-FLOAT-64R2-NEXT: dext $3, $3, 0, 32
+; SOFT-FLOAT-64R2-NEXT: or $3, $3, $1
+; SOFT-FLOAT-64R2-NEXT: ld $16, 16($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64R2-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64R2-NEXT: jr $ra
+; SOFT-FLOAT-64R2-NEXT: daddiu $sp, $sp, 32
+ %result = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> %a)
+ ret { <2 x float>, <2 x float> } %result
+}
+
+define { <3 x float>, <3 x float> } @test_sincos_v3f32(<3 x float> %a) #0 {
+; MIPSEL-LABEL: test_sincos_v3f32:
+; MIPSEL: # %bb.0:
+; MIPSEL-NEXT: addiu $sp, $sp, -32
+; MIPSEL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: sw $18, 24($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: sw $17, 20($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: sw $16, 16($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: move $16, $6
+; MIPSEL-NEXT: move $17, $5
+; MIPSEL-NEXT: move $18, $4
+; MIPSEL-NEXT: mtc1 $7, $f12
+; MIPSEL-NEXT: addiu $5, $4, 8
+; MIPSEL-NEXT: jal sincosf
+; MIPSEL-NEXT: addiu $6, $4, 24
+; MIPSEL-NEXT: mtc1 $16, $f12
+; MIPSEL-NEXT: addiu $5, $18, 4
+; MIPSEL-NEXT: jal sincosf
+; MIPSEL-NEXT: addiu $6, $18, 20
+; MIPSEL-NEXT: mtc1 $17, $f12
+; MIPSEL-NEXT: addiu $6, $18, 16
+; MIPSEL-NEXT: jal sincosf
+; MIPSEL-NEXT: move $5, $18
+; MIPSEL-NEXT: lw $16, 16($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: lw $17, 20($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: lw $18, 24($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: jr $ra
+; MIPSEL-NEXT: addiu $sp, $sp, 32
+;
+; SOFT-FLOAT-32-LABEL: test_sincos_v3f32:
+; SOFT-FLOAT-32: # %bb.0:
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -32
+; SOFT-FLOAT-32-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: sw $18, 24($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: sw $17, 20($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: sw $16, 16($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: move $16, $6
+; SOFT-FLOAT-32-NEXT: move $17, $5
+; SOFT-FLOAT-32-NEXT: move $18, $4
+; SOFT-FLOAT-32-NEXT: mtc1 $7, $f12
+; SOFT-FLOAT-32-NEXT: addiu $5, $4, 8
+; SOFT-FLOAT-32-NEXT: jal sincosf
+; SOFT-FLOAT-32-NEXT: addiu $6, $4, 24
+; SOFT-FLOAT-32-NEXT: mtc1 $16, $f12
+; SOFT-FLOAT-32-NEXT: addiu $5, $18, 4
+; SOFT-FLOAT-32-NEXT: jal sincosf
+; SOFT-FLOAT-32-NEXT: addiu $6, $18, 20
+; SOFT-FLOAT-32-NEXT: mtc1 $17, $f12
+; SOFT-FLOAT-32-NEXT: addiu $6, $18, 16
+; SOFT-FLOAT-32-NEXT: jal sincosf
+; SOFT-FLOAT-32-NEXT: move $5, $18
+; SOFT-FLOAT-32-NEXT: lw $16, 16($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: lw $17, 20($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: lw $18, 24($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: jr $ra
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 32
+;
+; SOFT-FLOAT-64-LABEL: test_sincos_v3f32:
+; SOFT-FLOAT-64: # %bb.0:
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -48
+; SOFT-FLOAT-64-NEXT: sdc1 $f25, 40($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: sdc1 $f24, 32($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: sd $16, 16($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: mov.s $f24, $f15
+; SOFT-FLOAT-64-NEXT: mov.s $f25, $f14
+; SOFT-FLOAT-64-NEXT: mov.s $f12, $f13
+; SOFT-FLOAT-64-NEXT: move $16, $4
+; SOFT-FLOAT-64-NEXT: daddiu $5, $sp, 4
+; SOFT-FLOAT-64-NEXT: jal sincosf
+; SOFT-FLOAT-64-NEXT: daddiu $6, $sp, 0
+; SOFT-FLOAT-64-NEXT: daddiu $5, $sp, 12
+; SOFT-FLOAT-64-NEXT: daddiu $6, $sp, 8
+; SOFT-FLOAT-64-NEXT: jal sincosf
+; SOFT-FLOAT-64-NEXT: mov.s $f12, $f25
+; SOFT-FLOAT-64-NEXT: daddiu $5, $16, 8
+; SOFT-FLOAT-64-NEXT: daddiu $6, $16, 24
+; SOFT-FLOAT-64-NEXT: jal sincosf
+; SOFT-FLOAT-64-NEXT: mov.s $f12, $f24
+; SOFT-FLOAT-64-NEXT: lwc1 $f0, 0($sp)
+; SOFT-FLOAT-64-NEXT: mfc1 $1, $f0
+; SOFT-FLOAT-64-NEXT: lwc1 $f0, 4($sp)
+; SOFT-FLOAT-64-NEXT: mfc1 $2, $f0
+; SOFT-FLOAT-64-NEXT: dsll $1, $1, 32
+; SOFT-FLOAT-64-NEXT: lwc1 $f0, 8($sp)
+; SOFT-FLOAT-64-NEXT: mfc1 $3, $f0
+; SOFT-FLOAT-64-NEXT: dsll $3, $3, 32
+; SOFT-FLOAT-64-NEXT: dsrl $3, $3, 32
+; SOFT-FLOAT-64-NEXT: or $1, $3, $1
+; SOFT-FLOAT-64-NEXT: sd $1, 16($16)
+; SOFT-FLOAT-64-NEXT: dsll $1, $2, 32
+; SOFT-FLOAT-64-NEXT: lwc1 $f0, 12($sp)
+; SOFT-FLOAT-64-NEXT: mfc1 $2, $f0
+; SOFT-FLOAT-64-NEXT: dsll $2, $2, 32
+; SOFT-FLOAT-64-NEXT: dsrl $2, $2, 32
+; SOFT-FLOAT-64-NEXT: or $1, $2, $1
+; SOFT-FLOAT-64-NEXT: sd $1, 0($16)
+; SOFT-FLOAT-64-NEXT: ld $16, 16($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: ldc1 $f24, 32($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: ldc1 $f25, 40($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: jr $ra
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 48
+;
+; SOFT-FLOAT-64R2-LABEL: test_sincos_v3f32:
+; SOFT-FLOAT-64R2: # %bb.0:
+; SOFT-FLOAT-64R2-NEXT: daddiu $sp, $sp, -48
+; SOFT-FLOAT-64R2-NEXT: sdc1 $f25, 40($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64R2-NEXT: sdc1 $f24, 32($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64R2-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64R2-NEXT: sd $16, 16($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64R2-NEXT: mov.s $f24, $f15
+; SOFT-FLOAT-64R2-NEXT: mov.s $f25, $f14
+; SOFT-FLOAT-64R2-NEXT: mov.s $f12, $f13
+; SOFT-FLOAT-64R2-NEXT: move $16, $4
+; SOFT-FLOAT-64R2-NEXT: daddiu $5, $sp, 4
+; SOFT-FLOAT-64R2-NEXT: jal sincosf
+; SOFT-FLOAT-64R2-NEXT: daddiu $6, $sp, 0
+; SOFT-FLOAT-64R2-NEXT: daddiu $5, $sp, 12
+; SOFT-FLOAT-64R2-NEXT: daddiu $6, $sp, 8
+; SOFT-FLOAT-64R2-NEXT: jal sincosf
+; SOFT-FLOAT-64R2-NEXT: mov.s $f12, $f25
+; SOFT-FLOAT-64R2-NEXT: daddiu $5, $16, 8
+; SOFT-FLOAT-64R2-NEXT: daddiu $6, $16, 24
+; SOFT-FLOAT-64R2-NEXT: jal sincosf
+; SOFT-FLOAT-64R2-NEXT: mov.s $f12, $f24
+; SOFT-FLOAT-64R2-NEXT: lwc1 $f0, 0($sp)
+; SOFT-FLOAT-64R2-NEXT: mfc1 $1, $f0
+; SOFT-FLOAT-64R2-NEXT: dsll $1, $1, 32
+; SOFT-FLOAT-64R2-NEXT: lwc1 $f0, 8($sp)
+; SOFT-FLOAT-64R2-NEXT: mfc1 $2, $f0
+; SOFT-FLOAT-64R2-NEXT: dext $2, $2, 0, 32
+; SOFT-FLOAT-64R2-NEXT: or $1, $2, $1
+; SOFT-FLOAT-64R2-NEXT: lwc1 $f0, 4($sp)
+; SOFT-FLOAT-64R2-NEXT: mfc1 $2, $f0
+; SOFT-FLOAT-64R2-NEXT: sd $1, 16($16)
+; SOFT-FLOAT-64R2-NEXT: dsll $1, $2, 32
+; SOFT-FLOAT-64R2-NEXT: lwc1 $f0, 12($sp)
+; SOFT-FLOAT-64R2-NEXT: mfc1 $2, $f0
+; SOFT-FLOAT-64R2-NEXT: dext $2, $2, 0, 32
+; SOFT-FLOAT-64R2-NEXT: or $1, $2, $1
+; SOFT-FLOAT-64R2-NEXT: sd $1, 0($16)
+; SOFT-FLOAT-64R2-NEXT: ld $16, 16($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64R2-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64R2-NEXT: ldc1 $f24, 32($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64R2-NEXT: ldc1 $f25, 40($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64R2-NEXT: jr $ra
+; SOFT-FLOAT-64R2-NEXT: daddiu $sp, $sp, 48
+ %result = call { <3 x float>, <3 x float> } @llvm.sincos.v3f32(<3 x float> %a)
+ ret { <3 x float>, <3 x float> } %result
+}
+
+define { double, double } @test_sincos_f64(double %a) #0 {
+; MIPSEL-LABEL: test_sincos_f64:
+; MIPSEL: # %bb.0:
+; MIPSEL-NEXT: addiu $sp, $sp, -40
+; MIPSEL-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: addiu $6, $sp, 24
+; MIPSEL-NEXT: jal sincos
+; MIPSEL-NEXT: addiu $7, $sp, 16
+; MIPSEL-NEXT: ldc1 $f0, 24($sp)
+; MIPSEL-NEXT: ldc1 $f2, 16($sp)
+; MIPSEL-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: jr $ra
+; MIPSEL-NEXT: addiu $sp, $sp, 40
+;
+; SOFT-FLOAT-32-LABEL: test_sincos_f64:
+; SOFT-FLOAT-32: # %bb.0:
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -40
+; SOFT-FLOAT-32-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: addiu $6, $sp, 24
+; SOFT-FLOAT-32-NEXT: jal sincos
+; SOFT-FLOAT-32-NEXT: addiu $7, $sp, 16
+; SOFT-FLOAT-32-NEXT: ldc1 $f0, 24($sp)
+; SOFT-FLOAT-32-NEXT: ldc1 $f2, 16($sp)
+; SOFT-FLOAT-32-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: jr $ra
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 40
+;
+; SOFT-FLOAT-64-LABEL: test_sincos_f64:
+; SOFT-FLOAT-64: # %bb.0:
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -32
+; SOFT-FLOAT-64-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: daddiu $5, $sp, 16
+; SOFT-FLOAT-64-NEXT: jal sincos
+; SOFT-FLOAT-64-NEXT: daddiu $6, $sp, 8
+; SOFT-FLOAT-64-NEXT: ldc1 $f0, 16($sp)
+; SOFT-FLOAT-64-NEXT: ldc1 $f2, 8($sp)
+; SOFT-FLOAT-64-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: jr $ra
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 32
+ %result = call { double, double } @llvm.sincos.f64(double %a)
+ ret { double, double } %result
+}
+
+define { <2 x double>, <2 x double> } @test_sincos_v2f64(<2 x double> %a) #0 {
+; MIPSEL-LABEL: test_sincos_v2f64:
+; MIPSEL: # %bb.0:
+; MIPSEL-NEXT: addiu $sp, $sp, -40
+; MIPSEL-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: sw $16, 32($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: move $16, $4
+; MIPSEL-NEXT: lw $1, 56($sp)
+; MIPSEL-NEXT: lw $2, 60($sp)
+; MIPSEL-NEXT: sw $2, 28($sp)
+; MIPSEL-NEXT: sw $1, 24($sp)
+; MIPSEL-NEXT: sw $7, 20($sp)
+; MIPSEL-NEXT: sw $6, 16($sp)
+; MIPSEL-NEXT: addiu $6, $4, 8
+; MIPSEL-NEXT: addiu $7, $4, 24
+; MIPSEL-NEXT: jal sincos
+; MIPSEL-NEXT: ldc1 $f12, 24($sp)
+; MIPSEL-NEXT: addiu $7, $16, 16
+; MIPSEL-NEXT: ldc1 $f12, 16($sp)
+; MIPSEL-NEXT: jal sincos
+; MIPSEL-NEXT: move $6, $16
+; MIPSEL-NEXT: lw $16, 32($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: jr $ra
+; MIPSEL-NEXT: addiu $sp, $sp, 40
+;
+; SOFT-FLOAT-32-LABEL: test_sincos_v2f64:
+; SOFT-FLOAT-32: # %bb.0:
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -40
+; SOFT-FLOAT-32-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: sw $16, 32($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: move $16, $4
+; SOFT-FLOAT-32-NEXT: lw $1, 56($sp)
+; SOFT-FLOAT-32-NEXT: lw $2, 60($sp)
+; SOFT-FLOAT-32-NEXT: sw $2, 28($sp)
+; SOFT-FLOAT-32-NEXT: sw $1, 24($sp)
+; SOFT-FLOAT-32-NEXT: sw $7, 20($sp)
+; SOFT-FLOAT-32-NEXT: sw $6, 16($sp)
+; SOFT-FLOAT-32-NEXT: addiu $6, $4, 8
+; SOFT-FLOAT-32-NEXT: addiu $7, $4, 24
+; SOFT-FLOAT-32-NEXT: jal sincos
+; SOFT-FLOAT-32-NEXT: ldc1 $f12, 24($sp)
+; SOFT-FLOAT-32-NEXT: addiu $7, $16, 16
+; SOFT-FLOAT-32-NEXT: ldc1 $f12, 16($sp)
+; SOFT-FLOAT-32-NEXT: jal sincos
+; SOFT-FLOAT-32-NEXT: move $6, $16
+; SOFT-FLOAT-32-NEXT: lw $16, 32($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: jr $ra
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 40
+;
+; SOFT-FLOAT-64-LABEL: test_sincos_v2f64:
+; SOFT-FLOAT-64: # %bb.0:
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -32
+; SOFT-FLOAT-64-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: sd $17, 16($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: sd $16, 8($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: move $16, $5
+; SOFT-FLOAT-64-NEXT: move $17, $4
+; SOFT-FLOAT-64-NEXT: dmtc1 $6, $f12
+; SOFT-FLOAT-64-NEXT: daddiu $5, $4, 8
+; SOFT-FLOAT-64-NEXT: jal sincos
+; SOFT-FLOAT-64-NEXT: daddiu $6, $4, 24
+; SOFT-FLOAT-64-NEXT: dmtc1 $16, $f12
+; SOFT-FLOAT-64-NEXT: daddiu $6, $17, 16
+; SOFT-FLOAT-64-NEXT: jal sincos
+; SOFT-FLOAT-64-NEXT: move $5, $17
+; SOFT-FLOAT-64-NEXT: ld $16, 8($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: ld $17, 16($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: jr $ra
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 32
+;
+; SOFT-FLOAT-32R2-LABEL: test_sincos_v2f64:
+; SOFT-FLOAT-32R2: # %bb.0:
+; SOFT-FLOAT-32R2-NEXT: addiu $sp, $sp, -32
+; SOFT-FLOAT-32R2-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32R2-NEXT: sw $18, 24($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32R2-NEXT: sw $17, 20($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32R2-NEXT: sw $16, 16($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32R2-NEXT: move $16, $7
+; SOFT-FLOAT-32R2-NEXT: move $17, $6
+; SOFT-FLOAT-32R2-NEXT: move $18, $4
+; SOFT-FLOAT-32R2-NEXT: lw $1, 48($sp)
+; SOFT-FLOAT-32R2-NEXT: lw $2, 52($sp)
+; SOFT-FLOAT-32R2-NEXT: mtc1 $2, $f12
+; SOFT-FLOAT-32R2-NEXT: mthc1 $1, $f12
+; SOFT-FLOAT-32R2-NEXT: addiu $6, $4, 8
+; SOFT-FLOAT-32R2-NEXT: jal sincos
+; SOFT-FLOAT-32R2-NEXT: addiu $7, $4, 24
+; SOFT-FLOAT-32R2-NEXT: mtc1 $16, $f12
+; SOFT-FLOAT-32R2-NEXT: mthc1 $17, $f12
+; SOFT-FLOAT-32R2-NEXT: addiu $7, $18, 16
+; SOFT-FLOAT-32R2-NEXT: jal sincos
+; SOFT-FLOAT-32R2-NEXT: move $6, $18
+; SOFT-FLOAT-32R2-NEXT: lw $16, 16($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32R2-NEXT: lw $17, 20($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32R2-NEXT: lw $18, 24($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32R2-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32R2-NEXT: jr $ra
+; SOFT-FLOAT-32R2-NEXT: addiu $sp, $sp, 32
+ %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %a)
+ ret { <2 x double>, <2 x double> } %result
+}
+
+define { fp128, fp128 } @test_sincos_f128(fp128 %a) #0 {
+; MIPSEL-LABEL: test_sincos_f128:
+; MIPSEL: # %bb.0:
+; MIPSEL-NEXT: addiu $sp, $sp, -64
+; MIPSEL-NEXT: sw $ra, 60($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: sw $16, 56($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: move $1, $7
+; MIPSEL-NEXT: move $16, $4
+; MIPSEL-NEXT: addiu $2, $sp, 24
+; MIPSEL-NEXT: sw $2, 20($sp)
+; MIPSEL-NEXT: addiu $2, $sp, 40
+; MIPSEL-NEXT: sw $2, 16($sp)
+; MIPSEL-NEXT: lw $7, 80($sp)
+; MIPSEL-NEXT: move $4, $5
+; MIPSEL-NEXT: move $5, $6
+; MIPSEL-NEXT: jal sincosl
+; MIPSEL-NEXT: move $6, $1
+; MIPSEL-NEXT: lw $1, 52($sp)
+; MIPSEL-NEXT: lw $2, 24($sp)
+; MIPSEL-NEXT: lw $3, 28($sp)
+; MIPSEL-NEXT: lw $4, 32($sp)
+; MIPSEL-NEXT: lw $5, 36($sp)
+; MIPSEL-NEXT: sw $5, 28($16)
+; MIPSEL-NEXT: sw $4, 24($16)
+; MIPSEL-NEXT: sw $3, 20($16)
+; MIPSEL-NEXT: sw $2, 16($16)
+; MIPSEL-NEXT: sw $1, 12($16)
+; MIPSEL-NEXT: lw $1, 48($sp)
+; MIPSEL-NEXT: sw $1, 8($16)
+; MIPSEL-NEXT: lw $1, 44($sp)
+; MIPSEL-NEXT: sw $1, 4($16)
+; MIPSEL-NEXT: lw $1, 40($sp)
+; MIPSEL-NEXT: sw $1, 0($16)
+; MIPSEL-NEXT: lw $16, 56($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: lw $ra, 60($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: jr $ra
+; MIPSEL-NEXT: addiu $sp, $sp, 64
+;
+; SOFT-FLOAT-32-LABEL: test_sincos_f128:
+; SOFT-FLOAT-32: # %bb.0:
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -64
+; SOFT-FLOAT-32-NEXT: sw $ra, 60($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: sw $16, 56($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: move $1, $7
+; SOFT-FLOAT-32-NEXT: move $16, $4
+; SOFT-FLOAT-32-NEXT: addiu $2, $sp, 24
+; SOFT-FLOAT-32-NEXT: sw $2, 20($sp)
+; SOFT-FLOAT-32-NEXT: addiu $2, $sp, 40
+; SOFT-FLOAT-32-NEXT: sw $2, 16($sp)
+; SOFT-FLOAT-32-NEXT: lw $7, 80($sp)
+; SOFT-FLOAT-32-NEXT: move $4, $5
+; SOFT-FLOAT-32-NEXT: move $5, $6
+; SOFT-FLOAT-32-NEXT: jal sincosl
+; SOFT-FLOAT-32-NEXT: move $6, $1
+; SOFT-FLOAT-32-NEXT: lw $1, 52($sp)
+; SOFT-FLOAT-32-NEXT: lw $2, 24($sp)
+; SOFT-FLOAT-32-NEXT: lw $3, 28($sp)
+; SOFT-FLOAT-32-NEXT: lw $4, 32($sp)
+; SOFT-FLOAT-32-NEXT: lw $5, 36($sp)
+; SOFT-FLOAT-32-NEXT: sw $5, 28($16)
+; SOFT-FLOAT-32-NEXT: sw $4, 24($16)
+; SOFT-FLOAT-32-NEXT: sw $3, 20($16)
+; SOFT-FLOAT-32-NEXT: sw $2, 16($16)
+; SOFT-FLOAT-32-NEXT: sw $1, 12($16)
+; SOFT-FLOAT-32-NEXT: lw $1, 48($sp)
+; SOFT-FLOAT-32-NEXT: sw $1, 8($16)
+; SOFT-FLOAT-32-NEXT: lw $1, 44($sp)
+; SOFT-FLOAT-32-NEXT: sw $1, 4($16)
+; SOFT-FLOAT-32-NEXT: lw $1, 40($sp)
+; SOFT-FLOAT-32-NEXT: sw $1, 0($16)
+; SOFT-FLOAT-32-NEXT: lw $16, 56($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: lw $ra, 60($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: jr $ra
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 64
+;
+; SOFT-FLOAT-64-LABEL: test_sincos_f128:
+; SOFT-FLOAT-64: # %bb.0:
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -48
+; SOFT-FLOAT-64-NEXT: sd $ra, 40($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: sd $16, 32($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: move $16, $4
+; SOFT-FLOAT-64-NEXT: dmfc1 $4, $f13
+; SOFT-FLOAT-64-NEXT: dmfc1 $5, $f14
+; SOFT-FLOAT-64-NEXT: daddiu $6, $sp, 16
+; SOFT-FLOAT-64-NEXT: jal sincosl
+; SOFT-FLOAT-64-NEXT: daddiu $7, $sp, 0
+; SOFT-FLOAT-64-NEXT: ld $1, 8($sp)
+; SOFT-FLOAT-64-NEXT: sd $1, 24($16)
+; SOFT-FLOAT-64-NEXT: ld $1, 0($sp)
+; SOFT-FLOAT-64-NEXT: sd $1, 16($16)
+; SOFT-FLOAT-64-NEXT: ld $1, 24($sp)
+; SOFT-FLOAT-64-NEXT: sd $1, 8($16)
+; SOFT-FLOAT-64-NEXT: ld $1, 16($sp)
+; SOFT-FLOAT-64-NEXT: sd $1, 0($16)
+; SOFT-FLOAT-64-NEXT: ld $16, 32($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: ld $ra, 40($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: jr $ra
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 48
+ %result = call { fp128, fp128 } @llvm.sincos.f128(fp128 %a)
+ ret { fp128, fp128 } %result
+}
+
+define { <2 x fp128>, <2 x fp128> } @test_sincos_v2f128(<2 x fp128> %a) #0 {
+; MIPSEL-LABEL: test_sincos_v2f128:
+; MIPSEL: # %bb.0:
+; MIPSEL-NEXT: addiu $sp, $sp, -96
+; MIPSEL-NEXT: sw $ra, 92($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: sw $16, 88($sp) # 4-byte Folded Spill
+; MIPSEL-NEXT: move $5, $7
+; MIPSEL-NEXT: move $1, $6
+; MIPSEL-NEXT: move $16, $4
+; MIPSEL-NEXT: addiu $2, $sp, 24
+; MIPSEL-NEXT: sw $2, 20($sp)
+; MIPSEL-NEXT: addiu $2, $sp, 40
+; MIPSEL-NEXT: sw $2, 16($sp)
+; MIPSEL-NEXT: lw $6, 112($sp)
+; MIPSEL-NEXT: lw $7, 116($sp)
+; MIPSEL-NEXT: jal sincosl
+; MIPSEL-NEXT: move $4, $1
+; MIPSEL-NEXT: addiu $1, $sp, 56
+; MIPSEL-NEXT: sw $1, 20($sp)
+; MIPSEL-NEXT: addiu $1, $sp, 72
+; MIPSEL-NEXT: sw $1, 16($sp)
+; MIPSEL-NEXT: lw $4, 120($sp)
+; MIPSEL-NEXT: lw $5, 124($sp)
+; MIPSEL-NEXT: lw $6, 128($sp)
+; MIPSEL-NEXT: lw $7, 132($sp)
+; MIPSEL-NEXT: jal sincosl
+; MIPSEL-NEXT: nop
+; MIPSEL-NEXT: lw $1, 36($sp)
+; MIPSEL-NEXT: lw $2, 56($sp)
+; MIPSEL-NEXT: lw $3, 60($sp)
+; MIPSEL-NEXT: lw $4, 64($sp)
+; MIPSEL-NEXT: lw $5, 52($sp)
+; MIPSEL-NEXT: lw $6, 72($sp)
+; MIPSEL-NEXT: lw $7, 76($sp)
+; MIPSEL-NEXT: lw $8, 80($sp)
+; MIPSEL-NEXT: lw $9, 84($sp)
+; MIPSEL-NEXT: lw $10, 24($sp)
+; MIPSEL-NEXT: lw $11, 28($sp)
+; MIPSEL-NEXT: lw $12, 32($sp)
+; MIPSEL-NEXT: lw $13, 68($sp)
+; MIPSEL-NEXT: sw $13, 60($16)
+; MIPSEL-NEXT: sw $4, 56($16)
+; MIPSEL-NEXT: sw $3, 52($16)
+; MIPSEL-NEXT: sw $2, 48($16)
+; MIPSEL-NEXT: sw $1, 44($16)
+; MIPSEL-NEXT: sw $12, 40($16)
+; MIPSEL-NEXT: sw $11, 36($16)
+; MIPSEL-NEXT: sw $10, 32($16)
+; MIPSEL-NEXT: sw $9, 28($16)
+; MIPSEL-NEXT: sw $8, 24($16)
+; MIPSEL-NEXT: sw $7, 20($16)
+; MIPSEL-NEXT: sw $6, 16($16)
+; MIPSEL-NEXT: sw $5, 12($16)
+; MIPSEL-NEXT: lw $1, 48($sp)
+; MIPSEL-NEXT: sw $1, 8($16)
+; MIPSEL-NEXT: lw $1, 44($sp)
+; MIPSEL-NEXT: sw $1, 4($16)
+; MIPSEL-NEXT: lw $1, 40($sp)
+; MIPSEL-NEXT: sw $1, 0($16)
+; MIPSEL-NEXT: lw $16, 88($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: lw $ra, 92($sp) # 4-byte Folded Reload
+; MIPSEL-NEXT: jr $ra
+; MIPSEL-NEXT: addiu $sp, $sp, 96
+;
+; SOFT-FLOAT-32-LABEL: test_sincos_v2f128:
+; SOFT-FLOAT-32: # %bb.0:
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, -96
+; SOFT-FLOAT-32-NEXT: sw $ra, 92($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: sw $16, 88($sp) # 4-byte Folded Spill
+; SOFT-FLOAT-32-NEXT: move $5, $7
+; SOFT-FLOAT-32-NEXT: move $1, $6
+; SOFT-FLOAT-32-NEXT: move $16, $4
+; SOFT-FLOAT-32-NEXT: addiu $2, $sp, 24
+; SOFT-FLOAT-32-NEXT: sw $2, 20($sp)
+; SOFT-FLOAT-32-NEXT: addiu $2, $sp, 40
+; SOFT-FLOAT-32-NEXT: sw $2, 16($sp)
+; SOFT-FLOAT-32-NEXT: lw $6, 112($sp)
+; SOFT-FLOAT-32-NEXT: lw $7, 116($sp)
+; SOFT-FLOAT-32-NEXT: jal sincosl
+; SOFT-FLOAT-32-NEXT: move $4, $1
+; SOFT-FLOAT-32-NEXT: addiu $1, $sp, 56
+; SOFT-FLOAT-32-NEXT: sw $1, 20($sp)
+; SOFT-FLOAT-32-NEXT: addiu $1, $sp, 72
+; SOFT-FLOAT-32-NEXT: sw $1, 16($sp)
+; SOFT-FLOAT-32-NEXT: lw $4, 120($sp)
+; SOFT-FLOAT-32-NEXT: lw $5, 124($sp)
+; SOFT-FLOAT-32-NEXT: lw $6, 128($sp)
+; SOFT-FLOAT-32-NEXT: lw $7, 132($sp)
+; SOFT-FLOAT-32-NEXT: jal sincosl
+; SOFT-FLOAT-32-NEXT: nop
+; SOFT-FLOAT-32-NEXT: lw $1, 36($sp)
+; SOFT-FLOAT-32-NEXT: lw $2, 56($sp)
+; SOFT-FLOAT-32-NEXT: lw $3, 60($sp)
+; SOFT-FLOAT-32-NEXT: lw $4, 64($sp)
+; SOFT-FLOAT-32-NEXT: lw $5, 52($sp)
+; SOFT-FLOAT-32-NEXT: lw $6, 72($sp)
+; SOFT-FLOAT-32-NEXT: lw $7, 76($sp)
+; SOFT-FLOAT-32-NEXT: lw $8, 80($sp)
+; SOFT-FLOAT-32-NEXT: lw $9, 84($sp)
+; SOFT-FLOAT-32-NEXT: lw $10, 24($sp)
+; SOFT-FLOAT-32-NEXT: lw $11, 28($sp)
+; SOFT-FLOAT-32-NEXT: lw $12, 32($sp)
+; SOFT-FLOAT-32-NEXT: lw $13, 68($sp)
+; SOFT-FLOAT-32-NEXT: sw $13, 60($16)
+; SOFT-FLOAT-32-NEXT: sw $4, 56($16)
+; SOFT-FLOAT-32-NEXT: sw $3, 52($16)
+; SOFT-FLOAT-32-NEXT: sw $2, 48($16)
+; SOFT-FLOAT-32-NEXT: sw $1, 44($16)
+; SOFT-FLOAT-32-NEXT: sw $12, 40($16)
+; SOFT-FLOAT-32-NEXT: sw $11, 36($16)
+; SOFT-FLOAT-32-NEXT: sw $10, 32($16)
+; SOFT-FLOAT-32-NEXT: sw $9, 28($16)
+; SOFT-FLOAT-32-NEXT: sw $8, 24($16)
+; SOFT-FLOAT-32-NEXT: sw $7, 20($16)
+; SOFT-FLOAT-32-NEXT: sw $6, 16($16)
+; SOFT-FLOAT-32-NEXT: sw $5, 12($16)
+; SOFT-FLOAT-32-NEXT: lw $1, 48($sp)
+; SOFT-FLOAT-32-NEXT: sw $1, 8($16)
+; SOFT-FLOAT-32-NEXT: lw $1, 44($sp)
+; SOFT-FLOAT-32-NEXT: sw $1, 4($16)
+; SOFT-FLOAT-32-NEXT: lw $1, 40($sp)
+; SOFT-FLOAT-32-NEXT: sw $1, 0($16)
+; SOFT-FLOAT-32-NEXT: lw $16, 88($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: lw $ra, 92($sp) # 4-byte Folded Reload
+; SOFT-FLOAT-32-NEXT: jr $ra
+; SOFT-FLOAT-32-NEXT: addiu $sp, $sp, 96
+;
+; SOFT-FLOAT-64-LABEL: test_sincos_v2f128:
+; SOFT-FLOAT-64: # %bb.0:
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -96
+; SOFT-FLOAT-64-NEXT: sd $ra, 88($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: sd $18, 80($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: sd $17, 72($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: sd $16, 64($sp) # 8-byte Folded Spill
+; SOFT-FLOAT-64-NEXT: move $1, $7
+; SOFT-FLOAT-64-NEXT: move $16, $6
+; SOFT-FLOAT-64-NEXT: move $17, $5
+; SOFT-FLOAT-64-NEXT: move $18, $4
+; SOFT-FLOAT-64-NEXT: daddiu $6, $sp, 48
+; SOFT-FLOAT-64-NEXT: daddiu $7, $sp, 32
+; SOFT-FLOAT-64-NEXT: move $4, $1
+; SOFT-FLOAT-64-NEXT: jal sincosl
+; SOFT-FLOAT-64-NEXT: move $5, $8
+; SOFT-FLOAT-64-NEXT: daddiu $6, $sp, 16
+; SOFT-FLOAT-64-NEXT: daddiu $7, $sp, 0
+; SOFT-FLOAT-64-NEXT: move $4, $17
+; SOFT-FLOAT-64-NEXT: jal sincosl
+; SOFT-FLOAT-64-NEXT: move $5, $16
+; SOFT-FLOAT-64-NEXT: ld $1, 56($sp)
+; SOFT-FLOAT-64-NEXT: ld $2, 0($sp)
+; SOFT-FLOAT-64-NEXT: ld $3, 8($sp)
+; SOFT-FLOAT-64-NEXT: ld $4, 32($sp)
+; SOFT-FLOAT-64-NEXT: ld $5, 40($sp)
+; SOFT-FLOAT-64-NEXT: sd $5, 56($18)
+; SOFT-FLOAT-64-NEXT: sd $4, 48($18)
+; SOFT-FLOAT-64-NEXT: sd $3, 40($18)
+; SOFT-FLOAT-64-NEXT: sd $2, 32($18)
+; SOFT-FLOAT-64-NEXT: sd $1, 24($18)
+; SOFT-FLOAT-64-NEXT: ld $1, 48($sp)
+; SOFT-FLOAT-64-NEXT: sd $1, 16($18)
+; SOFT-FLOAT-64-NEXT: ld $1, 24($sp)
+; SOFT-FLOAT-64-NEXT: sd $1, 8($18)
+; SOFT-FLOAT-64-NEXT: ld $1, 16($sp)
+; SOFT-FLOAT-64-NEXT: sd $1, 0($18)
+; SOFT-FLOAT-64-NEXT: ld $16, 64($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: ld $17, 72($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: ld $18, 80($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: ld $ra, 88($sp) # 8-byte Folded Reload
+; SOFT-FLOAT-64-NEXT: jr $ra
+; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 96
+ %result = call { <2 x fp128>, <2 x fp128> } @llvm.sincos.v2f128(<2 x fp128> %a)
+ ret { <2 x fp128>, <2 x fp128> } %result
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/Mips/nan_lowering.ll b/llvm/test/CodeGen/Mips/nan_lowering.ll
new file mode 100644
index 0000000..2a11278
--- /dev/null
+++ b/llvm/test/CodeGen/Mips/nan_lowering.ll
@@ -0,0 +1,25 @@
+; RUN: llc -mtriple=mips-linux-gnu -mattr=-nan2008 < %s | FileCheck %s
+; RUN: llc -mtriple=mips-linux-gnu -mattr=+nan2008 < %s | FileCheck %s
+
+; Make sure that lowering does not corrupt NaN values, regardless of
+; which NaN encoding mode is in use.
+
+define float @test1() {
+; CHECK: .4byte 0x7fc00000
+ ret float bitcast (i32 u0x7fc00000 to float)
+}
+
+define float @test2() {
+; CHECK: .4byte 0x7fc00001
+ ret float bitcast (i32 u0x7fc00001 to float)
+}
+
+define float @test3() {
+; CHECK: .4byte 0x7f800000
+ ret float bitcast (i32 u0x7f800000 to float)
+}
+
+define float @test4() {
+; CHECK: .4byte 0x7f800001
+ ret float bitcast (i32 u0x7f800001 to float)
+}
diff --git a/llvm/test/CodeGen/Mips/qnan.ll b/llvm/test/CodeGen/Mips/qnan.ll
deleted file mode 100644
index e5b4aa1..0000000
--- a/llvm/test/CodeGen/Mips/qnan.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc -O3 -mcpu=mips32r2 -mtriple=mips-linux-gnu < %s -o - | FileCheck %s -check-prefixes=MIPS_Legacy
-; RUN: llc -O3 -mcpu=mips32r2 -mtriple=mips-linux-gnu -mattr=+nan2008 < %s -o - | FileCheck %s -check-prefixes=MIPS_NaN2008
-
-define dso_local float @nan(float noundef %a, float noundef %b) local_unnamed_addr #0 {
-; MIPS_Legacy: $CPI0_0:
-; MIPS_Legacy-NEXT: .4byte 0x7fa00000 # float NaN
-
-; MIPS_NaN2008: $CPI0_0:
-; MIPS_NaN2008-NEXT: .4byte 0x7fc00000 # float NaN
-
-entry:
- %0 = tail call float @llvm.minimum.f32(float %a, float %b)
- ret float %0
-}
diff --git a/llvm/test/CodeGen/NVPTX/aggregate-return.ll b/llvm/test/CodeGen/NVPTX/aggregate-return.ll
index 7f52e52..abc873e 100644
--- a/llvm/test/CodeGen/NVPTX/aggregate-return.ll
+++ b/llvm/test/CodeGen/NVPTX/aggregate-return.ll
@@ -16,8 +16,8 @@ define void @test_v2f32(<2 x float> %input, ptr %output) {
; CHECK-NEXT: ld.param.b64 %rd1, [test_v2f32_param_0];
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.b64 [param0], %rd1;
; CHECK-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-NEXT: st.param.b64 [param0], %rd1;
; CHECK-NEXT: call.uni (retval0), barv, (param0);
; CHECK-NEXT: ld.param.b64 %rd2, [retval0];
; CHECK-NEXT: } // callseq 0
@@ -32,24 +32,24 @@ define void @test_v2f32(<2 x float> %input, ptr %output) {
define void @test_v3f32(<3 x float> %input, ptr %output) {
; CHECK-LABEL: test_v3f32(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<10>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v3f32_param_0];
-; CHECK-NEXT: ld.param.b32 %r3, [test_v3f32_param_0+8];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_v3f32_param_0];
+; CHECK-NEXT: ld.param.b32 %r1, [test_v3f32_param_0+8];
; CHECK-NEXT: { // callseq 1, 0
; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2};
-; CHECK-NEXT: st.param.b32 [param0+8], %r3;
; CHECK-NEXT: .param .align 16 .b8 retval0[16];
+; CHECK-NEXT: st.param.b32 [param0+8], %r1;
+; CHECK-NEXT: st.param.b64 [param0], %rd1;
; CHECK-NEXT: call.uni (retval0), barv3, (param0);
-; CHECK-NEXT: ld.param.v2.b32 {%r4, %r5}, [retval0];
-; CHECK-NEXT: ld.param.b32 %r6, [retval0+8];
+; CHECK-NEXT: ld.param.b32 %r2, [retval0+8];
+; CHECK-NEXT: ld.param.b64 %rd2, [retval0];
; CHECK-NEXT: } // callseq 1
-; CHECK-NEXT: ld.param.b64 %rd1, [test_v3f32_param_1];
-; CHECK-NEXT: st.v2.b32 [%rd1], {%r4, %r5};
-; CHECK-NEXT: st.b32 [%rd1+8], %r6;
+; CHECK-NEXT: ld.param.b64 %rd4, [test_v3f32_param_1];
+; CHECK-NEXT: st.b32 [%rd4+8], %r2;
+; CHECK-NEXT: st.b64 [%rd4], %rd2;
; CHECK-NEXT: ret;
%call = tail call <3 x float> @barv3(<3 x float> %input)
; Make sure we don't load more values than we need to.
@@ -68,16 +68,16 @@ define void @test_a2f32([2 x float] %input, ptr %output) {
; CHECK-NEXT: ld.param.b32 %r2, [test_a2f32_param_0+4];
; CHECK-NEXT: { // callseq 2, 0
; CHECK-NEXT: .param .align 4 .b8 param0[8];
-; CHECK-NEXT: st.param.b32 [param0], %r1;
-; CHECK-NEXT: st.param.b32 [param0+4], %r2;
; CHECK-NEXT: .param .align 4 .b8 retval0[8];
+; CHECK-NEXT: st.param.b32 [param0+4], %r2;
+; CHECK-NEXT: st.param.b32 [param0], %r1;
; CHECK-NEXT: call.uni (retval0), bara, (param0);
-; CHECK-NEXT: ld.param.b32 %r3, [retval0];
-; CHECK-NEXT: ld.param.b32 %r4, [retval0+4];
+; CHECK-NEXT: ld.param.b32 %r3, [retval0+4];
+; CHECK-NEXT: ld.param.b32 %r4, [retval0];
; CHECK-NEXT: } // callseq 2
; CHECK-NEXT: ld.param.b64 %rd1, [test_a2f32_param_1];
-; CHECK-NEXT: st.b32 [%rd1+4], %r4;
-; CHECK-NEXT: st.b32 [%rd1], %r3;
+; CHECK-NEXT: st.b32 [%rd1+4], %r3;
+; CHECK-NEXT: st.b32 [%rd1], %r4;
; CHECK-NEXT: ret;
%call = tail call [2 x float] @bara([2 x float] %input)
store [2 x float] %call, ptr %output, align 4
@@ -95,16 +95,16 @@ define void @test_s2f32({float, float} %input, ptr %output) {
; CHECK-NEXT: ld.param.b32 %r2, [test_s2f32_param_0+4];
; CHECK-NEXT: { // callseq 3, 0
; CHECK-NEXT: .param .align 4 .b8 param0[8];
-; CHECK-NEXT: st.param.b32 [param0], %r1;
-; CHECK-NEXT: st.param.b32 [param0+4], %r2;
; CHECK-NEXT: .param .align 4 .b8 retval0[8];
+; CHECK-NEXT: st.param.b32 [param0+4], %r2;
+; CHECK-NEXT: st.param.b32 [param0], %r1;
; CHECK-NEXT: call.uni (retval0), bars, (param0);
-; CHECK-NEXT: ld.param.b32 %r3, [retval0];
-; CHECK-NEXT: ld.param.b32 %r4, [retval0+4];
+; CHECK-NEXT: ld.param.b32 %r3, [retval0+4];
+; CHECK-NEXT: ld.param.b32 %r4, [retval0];
; CHECK-NEXT: } // callseq 3
; CHECK-NEXT: ld.param.b64 %rd1, [test_s2f32_param_1];
-; CHECK-NEXT: st.b32 [%rd1+4], %r4;
-; CHECK-NEXT: st.b32 [%rd1], %r3;
+; CHECK-NEXT: st.b32 [%rd1+4], %r3;
+; CHECK-NEXT: st.b32 [%rd1], %r4;
; CHECK-NEXT: ret;
%call = tail call {float, float} @bars({float, float} %input)
store {float, float} %call, ptr %output, align 4
diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
index ba5813c..b4641d0 100644
--- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
@@ -208,13 +208,13 @@ define <2 x bfloat> @test_call(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_call_param_0];
-; CHECK-NEXT: ld.param.b32 %r2, [test_call_param_1];
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.b32 [param0], %r1;
; CHECK-NEXT: .param .align 4 .b8 param1[4];
-; CHECK-NEXT: st.param.b32 [param1], %r2;
; CHECK-NEXT: .param .align 4 .b8 retval0[4];
+; CHECK-NEXT: ld.param.b32 %r2, [test_call_param_1];
+; CHECK-NEXT: st.param.b32 [param1], %r2;
+; CHECK-NEXT: st.param.b32 [param0], %r1;
; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1);
; CHECK-NEXT: ld.param.b32 %r3, [retval0];
; CHECK-NEXT: } // callseq 0
diff --git a/llvm/test/CodeGen/NVPTX/byval-const-global.ll b/llvm/test/CodeGen/NVPTX/byval-const-global.ll
index ad9e4b0..b4934e1a 100644
--- a/llvm/test/CodeGen/NVPTX/byval-const-global.ll
+++ b/llvm/test/CodeGen/NVPTX/byval-const-global.ll
@@ -13,12 +13,12 @@ define void @foo() {
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.global.b64 %rd1, [G];
-; CHECK-NEXT: ld.global.b64 %rd2, [G+8];
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 8 .b8 param0[16];
-; CHECK-NEXT: st.param.b64 [param0], %rd1;
-; CHECK-NEXT: st.param.b64 [param0+8], %rd2;
+; CHECK-NEXT: ld.global.b64 %rd1, [G+8];
+; CHECK-NEXT: st.param.b64 [param0+8], %rd1;
+; CHECK-NEXT: ld.global.b64 %rd2, [G];
+; CHECK-NEXT: st.param.b64 [param0], %rd2;
; CHECK-NEXT: call.uni bar, (param0);
; CHECK-NEXT: } // callseq 0
; CHECK-NEXT: ret;
diff --git a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
index 0cd7058..0eb7f64 100644
--- a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
+++ b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
@@ -44,11 +44,11 @@ entry:
%arrayidx7 = getelementptr inbounds [16 x i8], ptr %buf, i64 0, i64 3
store float %3, ptr %arrayidx7, align 4
-; CHECK: .param .b64 param0;
-; CHECK-NEXT: st.param.b64 [param0], %rd[[A_REG]]
-; CHECK-NEXT: .param .b64 param1;
-; CHECK-NEXT: st.param.b64 [param1], %rd[[SP_REG]]
-; CHECK-NEXT: call.uni callee,
+; CHECK-DAG: .param .b64 param0;
+; CHECK-DAG: .param .b64 param1;
+; CHECK-DAG: st.param.b64 [param0], %rd[[A_REG]]
+; CHECK-DAG: st.param.b64 [param1], %rd[[SP_REG]]
+; CHECK: call.uni callee,
call void @callee(ptr %a, ptr %buf) #2
ret void
diff --git a/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll b/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll
index f67145d..483d48a 100644
--- a/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll
+++ b/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll
@@ -14,11 +14,11 @@ target triple = "nvptx64-nvidia-cuda"
%complex_half = type { half, half }
; CHECK: .param .align 2 .b8 param2[4];
-; CHECK: st.param.b16 [param2], %rs1;
-; CHECK: st.param.b16 [param2+2], %rs2;
; CHECK: .param .align 2 .b8 retval0[4];
-; CHECK-NEXT: prototype_0 : .callprototype (.param .align 2 .b8 _[4]) _ (.param .b32 _, .param .b32 _, .param .align 2 .b8 _[4]);
-; CHECK-NEXT: call (retval0),
+; CHECK-DAG: st.param.b16 [param2], %rs{{[0-9]+}};
+; CHECK-DAG: st.param.b16 [param2+2], %rs{{[0-9]+}};
+; CHECK: prototype_0 : .callprototype (.param .align 2 .b8 _[4]) _ (.param .b32 _, .param .b32 _, .param .align 2 .b8 _[4]);
+; CHECK: call (retval0),
define weak_odr void @foo() {
entry:
%call.i.i.i = tail call %"class.complex" @_Z20__spirv_GroupCMulKHRjjN5__spv12complex_halfE(i32 0, i32 0, ptr byval(%"class.complex") null)
@@ -36,10 +36,10 @@ define internal void @callee(ptr byval(%"class.complex") %byval_arg) {
}
define void @boom() {
%fp = call ptr @usefp(ptr @callee)
- ; CHECK: .param .align 2 .b8 param0[4];
- ; CHECK: st.param.b16 [param0], %rs1;
- ; CHECK: st.param.b16 [param0+2], %rs2;
- ; CHECK: .callprototype ()_ (.param .align 2 .b8 _[4]);
+ ; CHECK-DAG: .param .align 2 .b8 param0[4];
+ ; CHECK-DAG: st.param.b16 [param0], %rs{{[0-9]+}};
+ ; CHECK-DAG: st.param.b16 [param0+2], %rs{{[0-9]+}};
+ ; CHECK-DAG: .callprototype ()_ (.param .align 2 .b8 _[4]);
call void %fp(ptr byval(%"class.complex") null)
ret void
}
diff --git a/llvm/test/CodeGen/NVPTX/combine-mad.ll b/llvm/test/CodeGen/NVPTX/combine-mad.ll
index 2232810..da303b7 100644
--- a/llvm/test/CodeGen/NVPTX/combine-mad.ll
+++ b/llvm/test/CodeGen/NVPTX/combine-mad.ll
@@ -199,10 +199,10 @@ define i32 @test_mad_multi_use(i32 %a, i32 %b, i32 %c) {
; CHECK-NEXT: add.s32 %r5, %r3, %r4;
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .b32 param0;
-; CHECK-NEXT: st.param.b32 [param0], %r3;
; CHECK-NEXT: .param .b32 param1;
-; CHECK-NEXT: st.param.b32 [param1], %r5;
; CHECK-NEXT: .param .b32 retval0;
+; CHECK-NEXT: st.param.b32 [param0], %r3;
+; CHECK-NEXT: st.param.b32 [param1], %r5;
; CHECK-NEXT: call.uni (retval0), use, (param0, param1);
; CHECK-NEXT: ld.param.b32 %r6, [retval0];
; CHECK-NEXT: } // callseq 0
diff --git a/llvm/test/CodeGen/NVPTX/compare-int.ll b/llvm/test/CodeGen/NVPTX/compare-int.ll
index b44ae47..9338172d 100644
--- a/llvm/test/CodeGen/NVPTX/compare-int.ll
+++ b/llvm/test/CodeGen/NVPTX/compare-int.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %}
@@ -11,90 +12,180 @@
;;; i64
define i64 @icmp_eq_i64(i64 %a, i64 %b) {
-; CHECK: setp.eq.b64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
-; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_eq_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [icmp_eq_i64_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [icmp_eq_i64_param_1];
+; CHECK-NEXT: setp.eq.b64 %p1, %rd1, %rd2;
+; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%cmp = icmp eq i64 %a, %b
%ret = zext i1 %cmp to i64
ret i64 %ret
}
define i64 @icmp_ne_i64(i64 %a, i64 %b) {
-; CHECK: setp.ne.b64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
-; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ne_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [icmp_ne_i64_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [icmp_ne_i64_param_1];
+; CHECK-NEXT: setp.ne.b64 %p1, %rd1, %rd2;
+; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%cmp = icmp ne i64 %a, %b
%ret = zext i1 %cmp to i64
ret i64 %ret
}
define i64 @icmp_ugt_i64(i64 %a, i64 %b) {
-; CHECK: setp.gt.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
-; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ugt_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [icmp_ugt_i64_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [icmp_ugt_i64_param_1];
+; CHECK-NEXT: setp.gt.u64 %p1, %rd1, %rd2;
+; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%cmp = icmp ugt i64 %a, %b
%ret = zext i1 %cmp to i64
ret i64 %ret
}
define i64 @icmp_uge_i64(i64 %a, i64 %b) {
-; CHECK: setp.ge.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
-; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_uge_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [icmp_uge_i64_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [icmp_uge_i64_param_1];
+; CHECK-NEXT: setp.ge.u64 %p1, %rd1, %rd2;
+; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%cmp = icmp uge i64 %a, %b
%ret = zext i1 %cmp to i64
ret i64 %ret
}
define i64 @icmp_ult_i64(i64 %a, i64 %b) {
-; CHECK: setp.lt.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
-; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ult_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [icmp_ult_i64_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [icmp_ult_i64_param_1];
+; CHECK-NEXT: setp.lt.u64 %p1, %rd1, %rd2;
+; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%cmp = icmp ult i64 %a, %b
%ret = zext i1 %cmp to i64
ret i64 %ret
}
define i64 @icmp_ule_i64(i64 %a, i64 %b) {
-; CHECK: setp.le.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
-; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ule_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [icmp_ule_i64_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [icmp_ule_i64_param_1];
+; CHECK-NEXT: setp.le.u64 %p1, %rd1, %rd2;
+; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%cmp = icmp ule i64 %a, %b
%ret = zext i1 %cmp to i64
ret i64 %ret
}
define i64 @icmp_sgt_i64(i64 %a, i64 %b) {
-; CHECK: setp.gt.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
-; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_sgt_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [icmp_sgt_i64_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [icmp_sgt_i64_param_1];
+; CHECK-NEXT: setp.gt.s64 %p1, %rd1, %rd2;
+; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%cmp = icmp sgt i64 %a, %b
%ret = zext i1 %cmp to i64
ret i64 %ret
}
define i64 @icmp_sge_i64(i64 %a, i64 %b) {
-; CHECK: setp.ge.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
-; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_sge_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [icmp_sge_i64_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [icmp_sge_i64_param_1];
+; CHECK-NEXT: setp.ge.s64 %p1, %rd1, %rd2;
+; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%cmp = icmp sge i64 %a, %b
%ret = zext i1 %cmp to i64
ret i64 %ret
}
define i64 @icmp_slt_i64(i64 %a, i64 %b) {
-; CHECK: setp.lt.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
-; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_slt_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [icmp_slt_i64_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [icmp_slt_i64_param_1];
+; CHECK-NEXT: setp.lt.s64 %p1, %rd1, %rd2;
+; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%cmp = icmp slt i64 %a, %b
%ret = zext i1 %cmp to i64
ret i64 %ret
}
define i64 @icmp_sle_i64(i64 %a, i64 %b) {
-; CHECK: setp.le.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
-; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_sle_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [icmp_sle_i64_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [icmp_sle_i64_param_1];
+; CHECK-NEXT: setp.le.s64 %p1, %rd1, %rd2;
+; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%cmp = icmp sle i64 %a, %b
%ret = zext i1 %cmp to i64
ret i64 %ret
@@ -103,90 +194,180 @@ define i64 @icmp_sle_i64(i64 %a, i64 %b) {
;;; i32
define i32 @icmp_eq_i32(i32 %a, i32 %b) {
-; CHECK: setp.eq.b32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_eq_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [icmp_eq_i32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [icmp_eq_i32_param_1];
+; CHECK-NEXT: setp.eq.b32 %p1, %r1, %r2;
+; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%cmp = icmp eq i32 %a, %b
%ret = zext i1 %cmp to i32
ret i32 %ret
}
define i32 @icmp_ne_i32(i32 %a, i32 %b) {
-; CHECK: setp.ne.b32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ne_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [icmp_ne_i32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [icmp_ne_i32_param_1];
+; CHECK-NEXT: setp.ne.b32 %p1, %r1, %r2;
+; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%cmp = icmp ne i32 %a, %b
%ret = zext i1 %cmp to i32
ret i32 %ret
}
define i32 @icmp_ugt_i32(i32 %a, i32 %b) {
-; CHECK: setp.gt.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ugt_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [icmp_ugt_i32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [icmp_ugt_i32_param_1];
+; CHECK-NEXT: setp.gt.u32 %p1, %r1, %r2;
+; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%cmp = icmp ugt i32 %a, %b
%ret = zext i1 %cmp to i32
ret i32 %ret
}
define i32 @icmp_uge_i32(i32 %a, i32 %b) {
-; CHECK: setp.ge.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_uge_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [icmp_uge_i32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [icmp_uge_i32_param_1];
+; CHECK-NEXT: setp.ge.u32 %p1, %r1, %r2;
+; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%cmp = icmp uge i32 %a, %b
%ret = zext i1 %cmp to i32
ret i32 %ret
}
define i32 @icmp_ult_i32(i32 %a, i32 %b) {
-; CHECK: setp.lt.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ult_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [icmp_ult_i32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [icmp_ult_i32_param_1];
+; CHECK-NEXT: setp.lt.u32 %p1, %r1, %r2;
+; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%cmp = icmp ult i32 %a, %b
%ret = zext i1 %cmp to i32
ret i32 %ret
}
define i32 @icmp_ule_i32(i32 %a, i32 %b) {
-; CHECK: setp.le.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ule_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [icmp_ule_i32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [icmp_ule_i32_param_1];
+; CHECK-NEXT: setp.le.u32 %p1, %r1, %r2;
+; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%cmp = icmp ule i32 %a, %b
%ret = zext i1 %cmp to i32
ret i32 %ret
}
define i32 @icmp_sgt_i32(i32 %a, i32 %b) {
-; CHECK: setp.gt.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_sgt_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [icmp_sgt_i32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [icmp_sgt_i32_param_1];
+; CHECK-NEXT: setp.gt.s32 %p1, %r1, %r2;
+; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%cmp = icmp sgt i32 %a, %b
%ret = zext i1 %cmp to i32
ret i32 %ret
}
define i32 @icmp_sge_i32(i32 %a, i32 %b) {
-; CHECK: setp.ge.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_sge_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [icmp_sge_i32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [icmp_sge_i32_param_1];
+; CHECK-NEXT: setp.ge.s32 %p1, %r1, %r2;
+; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%cmp = icmp sge i32 %a, %b
%ret = zext i1 %cmp to i32
ret i32 %ret
}
define i32 @icmp_slt_i32(i32 %a, i32 %b) {
-; CHECK: setp.lt.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_slt_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [icmp_slt_i32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [icmp_slt_i32_param_1];
+; CHECK-NEXT: setp.lt.s32 %p1, %r1, %r2;
+; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%cmp = icmp slt i32 %a, %b
%ret = zext i1 %cmp to i32
ret i32 %ret
}
define i32 @icmp_sle_i32(i32 %a, i32 %b) {
-; CHECK: setp.le.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_sle_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [icmp_sle_i32_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [icmp_sle_i32_param_1];
+; CHECK-NEXT: setp.le.s32 %p1, %r1, %r2;
+; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
%cmp = icmp sle i32 %a, %b
%ret = zext i1 %cmp to i32
ret i32 %ret
@@ -196,90 +377,190 @@ define i32 @icmp_sle_i32(i32 %a, i32 %b) {
;;; i16
define i16 @icmp_eq_i16(i16 %a, i16 %b) {
-; CHECK: setp.eq.b16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_eq_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [icmp_eq_i16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [icmp_eq_i16_param_1];
+; CHECK-NEXT: setp.eq.b16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp eq i16 %a, %b
%ret = zext i1 %cmp to i16
ret i16 %ret
}
define i16 @icmp_ne_i16(i16 %a, i16 %b) {
-; CHECK: setp.ne.b16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ne_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [icmp_ne_i16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [icmp_ne_i16_param_1];
+; CHECK-NEXT: setp.ne.b16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp ne i16 %a, %b
%ret = zext i1 %cmp to i16
ret i16 %ret
}
define i16 @icmp_ugt_i16(i16 %a, i16 %b) {
-; CHECK: setp.gt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ugt_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [icmp_ugt_i16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [icmp_ugt_i16_param_1];
+; CHECK-NEXT: setp.gt.u16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp ugt i16 %a, %b
%ret = zext i1 %cmp to i16
ret i16 %ret
}
define i16 @icmp_uge_i16(i16 %a, i16 %b) {
-; CHECK: setp.ge.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_uge_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [icmp_uge_i16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [icmp_uge_i16_param_1];
+; CHECK-NEXT: setp.ge.u16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp uge i16 %a, %b
%ret = zext i1 %cmp to i16
ret i16 %ret
}
define i16 @icmp_ult_i16(i16 %a, i16 %b) {
-; CHECK: setp.lt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ult_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [icmp_ult_i16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [icmp_ult_i16_param_1];
+; CHECK-NEXT: setp.lt.u16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp ult i16 %a, %b
%ret = zext i1 %cmp to i16
ret i16 %ret
}
define i16 @icmp_ule_i16(i16 %a, i16 %b) {
-; CHECK: setp.le.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ule_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [icmp_ule_i16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [icmp_ule_i16_param_1];
+; CHECK-NEXT: setp.le.u16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp ule i16 %a, %b
%ret = zext i1 %cmp to i16
ret i16 %ret
}
define i16 @icmp_sgt_i16(i16 %a, i16 %b) {
-; CHECK: setp.gt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_sgt_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [icmp_sgt_i16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [icmp_sgt_i16_param_1];
+; CHECK-NEXT: setp.gt.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp sgt i16 %a, %b
%ret = zext i1 %cmp to i16
ret i16 %ret
}
define i16 @icmp_sge_i16(i16 %a, i16 %b) {
-; CHECK: setp.ge.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_sge_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [icmp_sge_i16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [icmp_sge_i16_param_1];
+; CHECK-NEXT: setp.ge.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp sge i16 %a, %b
%ret = zext i1 %cmp to i16
ret i16 %ret
}
define i16 @icmp_slt_i16(i16 %a, i16 %b) {
-; CHECK: setp.lt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_slt_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [icmp_slt_i16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [icmp_slt_i16_param_1];
+; CHECK-NEXT: setp.lt.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp slt i16 %a, %b
%ret = zext i1 %cmp to i16
ret i16 %ret
}
define i16 @icmp_sle_i16(i16 %a, i16 %b) {
-; CHECK: setp.le.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_sle_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [icmp_sle_i16_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [icmp_sle_i16_param_1];
+; CHECK-NEXT: setp.le.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp sle i16 %a, %b
%ret = zext i1 %cmp to i16
ret i16 %ret
@@ -290,9 +571,19 @@ define i16 @icmp_sle_i16(i16 %a, i16 %b) {
define i8 @icmp_eq_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
-; CHECK: setp.eq.b16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_eq_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b8 %rs1, [icmp_eq_i8_param_0];
+; CHECK-NEXT: ld.param.b8 %rs2, [icmp_eq_i8_param_1];
+; CHECK-NEXT: setp.eq.b16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp eq i8 %a, %b
%ret = zext i1 %cmp to i8
ret i8 %ret
@@ -300,9 +591,19 @@ define i8 @icmp_eq_i8(i8 %a, i8 %b) {
define i8 @icmp_ne_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
-; CHECK: setp.ne.b16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ne_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b8 %rs1, [icmp_ne_i8_param_0];
+; CHECK-NEXT: ld.param.b8 %rs2, [icmp_ne_i8_param_1];
+; CHECK-NEXT: setp.ne.b16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp ne i8 %a, %b
%ret = zext i1 %cmp to i8
ret i8 %ret
@@ -310,9 +611,19 @@ define i8 @icmp_ne_i8(i8 %a, i8 %b) {
define i8 @icmp_ugt_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
-; CHECK: setp.gt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ugt_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b8 %rs1, [icmp_ugt_i8_param_0];
+; CHECK-NEXT: ld.param.b8 %rs2, [icmp_ugt_i8_param_1];
+; CHECK-NEXT: setp.gt.u16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp ugt i8 %a, %b
%ret = zext i1 %cmp to i8
ret i8 %ret
@@ -320,9 +631,19 @@ define i8 @icmp_ugt_i8(i8 %a, i8 %b) {
define i8 @icmp_uge_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
-; CHECK: setp.ge.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_uge_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b8 %rs1, [icmp_uge_i8_param_0];
+; CHECK-NEXT: ld.param.b8 %rs2, [icmp_uge_i8_param_1];
+; CHECK-NEXT: setp.ge.u16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp uge i8 %a, %b
%ret = zext i1 %cmp to i8
ret i8 %ret
@@ -330,9 +651,19 @@ define i8 @icmp_uge_i8(i8 %a, i8 %b) {
define i8 @icmp_ult_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
-; CHECK: setp.lt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ult_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b8 %rs1, [icmp_ult_i8_param_0];
+; CHECK-NEXT: ld.param.b8 %rs2, [icmp_ult_i8_param_1];
+; CHECK-NEXT: setp.lt.u16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp ult i8 %a, %b
%ret = zext i1 %cmp to i8
ret i8 %ret
@@ -340,9 +671,19 @@ define i8 @icmp_ult_i8(i8 %a, i8 %b) {
define i8 @icmp_ule_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
-; CHECK: setp.le.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_ule_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b8 %rs1, [icmp_ule_i8_param_0];
+; CHECK-NEXT: ld.param.b8 %rs2, [icmp_ule_i8_param_1];
+; CHECK-NEXT: setp.le.u16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp ule i8 %a, %b
%ret = zext i1 %cmp to i8
ret i8 %ret
@@ -350,9 +691,19 @@ define i8 @icmp_ule_i8(i8 %a, i8 %b) {
define i8 @icmp_sgt_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
-; CHECK: setp.gt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_sgt_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.s8 %rs1, [icmp_sgt_i8_param_0];
+; CHECK-NEXT: ld.param.s8 %rs2, [icmp_sgt_i8_param_1];
+; CHECK-NEXT: setp.gt.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp sgt i8 %a, %b
%ret = zext i1 %cmp to i8
ret i8 %ret
@@ -360,9 +711,19 @@ define i8 @icmp_sgt_i8(i8 %a, i8 %b) {
define i8 @icmp_sge_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
-; CHECK: setp.ge.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_sge_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.s8 %rs1, [icmp_sge_i8_param_0];
+; CHECK-NEXT: ld.param.s8 %rs2, [icmp_sge_i8_param_1];
+; CHECK-NEXT: setp.ge.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp sge i8 %a, %b
%ret = zext i1 %cmp to i8
ret i8 %ret
@@ -370,9 +731,19 @@ define i8 @icmp_sge_i8(i8 %a, i8 %b) {
define i8 @icmp_slt_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
-; CHECK: setp.lt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_slt_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.s8 %rs1, [icmp_slt_i8_param_0];
+; CHECK-NEXT: ld.param.s8 %rs2, [icmp_slt_i8_param_1];
+; CHECK-NEXT: setp.lt.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp slt i8 %a, %b
%ret = zext i1 %cmp to i8
ret i8 %ret
@@ -380,9 +751,19 @@ define i8 @icmp_slt_i8(i8 %a, i8 %b) {
define i8 @icmp_sle_i8(i8 %a, i8 %b) {
; Comparison happens in 16-bit
-; CHECK: setp.le.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
-; CHECK: ret
+; CHECK-LABEL: icmp_sle_i8(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.s8 %rs1, [icmp_sle_i8_param_0];
+; CHECK-NEXT: ld.param.s8 %rs2, [icmp_sle_i8_param_1];
+; CHECK-NEXT: setp.le.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
%cmp = icmp sle i8 %a, %b
%ret = zext i1 %cmp to i8
ret i8 %ret
diff --git a/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll b/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll
index d1b478d..48209a8 100644
--- a/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 | %ptxas-verify -arch=sm_90 %}
@@ -7,52 +8,203 @@ declare i64 @callee_variadic(ptr %p, ...);
define %struct.64 @test_return_type_mismatch(ptr %p) {
; CHECK-LABEL: test_return_type_mismatch(
-; CHECK: .param .align 1 .b8 retval0[8];
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<40>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_return_type_mismatch_param_0];
+; CHECK-NEXT: { // callseq 0, 0
+; CHECK-NEXT: .param .b64 param0;
+; CHECK-NEXT: .param .align 1 .b8 retval0[8];
+; CHECK-NEXT: st.param.b64 [param0], %rd2;
; CHECK-NEXT: prototype_0 : .callprototype (.param .align 1 .b8 _[8]) _ (.param .b64 _);
-; CHECK-NEXT: call (retval0), %rd{{[0-9]+}}, (param0), prototype_0;
+; CHECK-NEXT: mov.b64 %rd1, callee;
+; CHECK-NEXT: call (retval0), %rd1, (param0), prototype_0;
+; CHECK-NEXT: ld.param.b8 %rd3, [retval0+7];
+; CHECK-NEXT: ld.param.b8 %rd4, [retval0+6];
+; CHECK-NEXT: ld.param.b8 %rd5, [retval0+5];
+; CHECK-NEXT: ld.param.b8 %rd6, [retval0+4];
+; CHECK-NEXT: ld.param.b8 %rd7, [retval0+3];
+; CHECK-NEXT: ld.param.b8 %rd8, [retval0+2];
+; CHECK-NEXT: ld.param.b8 %rd9, [retval0+1];
+; CHECK-NEXT: ld.param.b8 %rd10, [retval0];
+; CHECK-NEXT: } // callseq 0
+; CHECK-NEXT: shl.b64 %rd13, %rd9, 8;
+; CHECK-NEXT: or.b64 %rd14, %rd13, %rd10;
+; CHECK-NEXT: shl.b64 %rd16, %rd8, 16;
+; CHECK-NEXT: shl.b64 %rd18, %rd7, 24;
+; CHECK-NEXT: or.b64 %rd19, %rd18, %rd16;
+; CHECK-NEXT: or.b64 %rd20, %rd19, %rd14;
+; CHECK-NEXT: shl.b64 %rd23, %rd5, 8;
+; CHECK-NEXT: or.b64 %rd24, %rd23, %rd6;
+; CHECK-NEXT: shl.b64 %rd26, %rd4, 16;
+; CHECK-NEXT: shl.b64 %rd28, %rd3, 24;
+; CHECK-NEXT: or.b64 %rd29, %rd28, %rd26;
+; CHECK-NEXT: or.b64 %rd30, %rd29, %rd24;
+; CHECK-NEXT: shl.b64 %rd31, %rd30, 32;
+; CHECK-NEXT: or.b64 %rd32, %rd31, %rd20;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rd10;
+; CHECK-NEXT: shr.u64 %rd33, %rd32, 56;
+; CHECK-NEXT: st.param.b8 [func_retval0+7], %rd33;
+; CHECK-NEXT: shr.u64 %rd34, %rd32, 48;
+; CHECK-NEXT: st.param.b8 [func_retval0+6], %rd34;
+; CHECK-NEXT: shr.u64 %rd35, %rd32, 40;
+; CHECK-NEXT: st.param.b8 [func_retval0+5], %rd35;
+; CHECK-NEXT: shr.u64 %rd36, %rd32, 32;
+; CHECK-NEXT: st.param.b8 [func_retval0+4], %rd36;
+; CHECK-NEXT: shr.u64 %rd37, %rd32, 24;
+; CHECK-NEXT: st.param.b8 [func_retval0+3], %rd37;
+; CHECK-NEXT: shr.u64 %rd38, %rd32, 16;
+; CHECK-NEXT: st.param.b8 [func_retval0+2], %rd38;
+; CHECK-NEXT: shr.u64 %rd39, %rd32, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rd39;
+; CHECK-NEXT: ret;
%ret = call %struct.64 @callee(ptr %p)
ret %struct.64 %ret
}
define i64 @test_param_type_mismatch(ptr %p) {
; CHECK-LABEL: test_param_type_mismatch(
-; CHECK: .param .b64 retval0;
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: { // callseq 1, 0
+; CHECK-NEXT: .param .b64 param0;
+; CHECK-NEXT: .param .b64 retval0;
; CHECK-NEXT: prototype_1 : .callprototype (.param .b64 _) _ (.param .b64 _);
-; CHECK-NEXT: call (retval0), %rd{{[0-9]+}}, (param0), prototype_1;
+; CHECK-NEXT: st.param.b64 [param0], 7;
+; CHECK-NEXT: mov.b64 %rd1, callee;
+; CHECK-NEXT: call (retval0), %rd1, (param0), prototype_1;
+; CHECK-NEXT: ld.param.b64 %rd2, [retval0];
+; CHECK-NEXT: } // callseq 1
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
%ret = call i64 @callee(i64 7)
ret i64 %ret
}
define i64 @test_param_count_mismatch(ptr %p) {
; CHECK-LABEL: test_param_count_mismatch(
-; CHECK: .param .b64 retval0;
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_param_count_mismatch_param_0];
+; CHECK-NEXT: { // callseq 2, 0
+; CHECK-NEXT: .param .b64 param0;
+; CHECK-NEXT: .param .b64 param1;
+; CHECK-NEXT: .param .b64 retval0;
+; CHECK-NEXT: st.param.b64 [param0], %rd2;
; CHECK-NEXT: prototype_2 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b64 _);
-; CHECK-NEXT: call (retval0), %rd{{[0-9]+}}, (param0, param1), prototype_2;
+; CHECK-NEXT: st.param.b64 [param1], 7;
+; CHECK-NEXT: mov.b64 %rd1, callee;
+; CHECK-NEXT: call (retval0), %rd1, (param0, param1), prototype_2;
+; CHECK-NEXT: ld.param.b64 %rd3, [retval0];
+; CHECK-NEXT: } // callseq 2
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%ret = call i64 @callee(ptr %p, i64 7)
ret i64 %ret
}
define %struct.64 @test_return_type_mismatch_variadic(ptr %p) {
; CHECK-LABEL: test_return_type_mismatch_variadic(
-; CHECK: .param .align 1 .b8 retval0[8];
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<40>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_return_type_mismatch_variadic_param_0];
+; CHECK-NEXT: { // callseq 3, 0
+; CHECK-NEXT: .param .b64 param0;
+; CHECK-NEXT: .param .align 1 .b8 retval0[8];
+; CHECK-NEXT: st.param.b64 [param0], %rd2;
; CHECK-NEXT: prototype_3 : .callprototype (.param .align 1 .b8 _[8]) _ (.param .b64 _);
-; CHECK-NEXT: call (retval0), %rd{{[0-9]+}}, (param0), prototype_3;
+; CHECK-NEXT: mov.b64 %rd1, callee_variadic;
+; CHECK-NEXT: call (retval0), %rd1, (param0), prototype_3;
+; CHECK-NEXT: ld.param.b8 %rd3, [retval0+7];
+; CHECK-NEXT: ld.param.b8 %rd4, [retval0+6];
+; CHECK-NEXT: ld.param.b8 %rd5, [retval0+5];
+; CHECK-NEXT: ld.param.b8 %rd6, [retval0+4];
+; CHECK-NEXT: ld.param.b8 %rd7, [retval0+3];
+; CHECK-NEXT: ld.param.b8 %rd8, [retval0+2];
+; CHECK-NEXT: ld.param.b8 %rd9, [retval0+1];
+; CHECK-NEXT: ld.param.b8 %rd10, [retval0];
+; CHECK-NEXT: } // callseq 3
+; CHECK-NEXT: shl.b64 %rd13, %rd9, 8;
+; CHECK-NEXT: or.b64 %rd14, %rd13, %rd10;
+; CHECK-NEXT: shl.b64 %rd16, %rd8, 16;
+; CHECK-NEXT: shl.b64 %rd18, %rd7, 24;
+; CHECK-NEXT: or.b64 %rd19, %rd18, %rd16;
+; CHECK-NEXT: or.b64 %rd20, %rd19, %rd14;
+; CHECK-NEXT: shl.b64 %rd23, %rd5, 8;
+; CHECK-NEXT: or.b64 %rd24, %rd23, %rd6;
+; CHECK-NEXT: shl.b64 %rd26, %rd4, 16;
+; CHECK-NEXT: shl.b64 %rd28, %rd3, 24;
+; CHECK-NEXT: or.b64 %rd29, %rd28, %rd26;
+; CHECK-NEXT: or.b64 %rd30, %rd29, %rd24;
+; CHECK-NEXT: shl.b64 %rd31, %rd30, 32;
+; CHECK-NEXT: or.b64 %rd32, %rd31, %rd20;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rd10;
+; CHECK-NEXT: shr.u64 %rd33, %rd32, 56;
+; CHECK-NEXT: st.param.b8 [func_retval0+7], %rd33;
+; CHECK-NEXT: shr.u64 %rd34, %rd32, 48;
+; CHECK-NEXT: st.param.b8 [func_retval0+6], %rd34;
+; CHECK-NEXT: shr.u64 %rd35, %rd32, 40;
+; CHECK-NEXT: st.param.b8 [func_retval0+5], %rd35;
+; CHECK-NEXT: shr.u64 %rd36, %rd32, 32;
+; CHECK-NEXT: st.param.b8 [func_retval0+4], %rd36;
+; CHECK-NEXT: shr.u64 %rd37, %rd32, 24;
+; CHECK-NEXT: st.param.b8 [func_retval0+3], %rd37;
+; CHECK-NEXT: shr.u64 %rd38, %rd32, 16;
+; CHECK-NEXT: st.param.b8 [func_retval0+2], %rd38;
+; CHECK-NEXT: shr.u64 %rd39, %rd32, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rd39;
+; CHECK-NEXT: ret;
%ret = call %struct.64 (ptr, ...) @callee_variadic(ptr %p)
ret %struct.64 %ret
}
define i64 @test_param_type_mismatch_variadic(ptr %p) {
; CHECK-LABEL: test_param_type_mismatch_variadic(
-; CHECK: .param .b64 retval0;
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_param_type_mismatch_variadic_param_0];
+; CHECK-NEXT: { // callseq 4, 0
+; CHECK-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-NEXT: .param .b64 param0;
+; CHECK-NEXT: .param .b64 retval0;
+; CHECK-NEXT: st.param.b64 [param0], %rd1;
+; CHECK-NEXT: st.param.b64 [param1], 7;
; CHECK-NEXT: call.uni (retval0), callee_variadic, (param0, param1);
+; CHECK-NEXT: ld.param.b64 %rd2, [retval0];
+; CHECK-NEXT: } // callseq 4
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
%ret = call i64 (ptr, ...) @callee_variadic(ptr %p, i64 7)
ret i64 %ret
}
define i64 @test_param_count_mismatch_variadic(ptr %p) {
; CHECK-LABEL: test_param_count_mismatch_variadic(
-; CHECK: .param .b64 retval0;
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_param_count_mismatch_variadic_param_0];
+; CHECK-NEXT: { // callseq 5, 0
+; CHECK-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-NEXT: .param .b64 param0;
+; CHECK-NEXT: .param .b64 retval0;
+; CHECK-NEXT: st.param.b64 [param0], %rd1;
+; CHECK-NEXT: st.param.b64 [param1], 7;
; CHECK-NEXT: call.uni (retval0), callee_variadic, (param0, param1);
+; CHECK-NEXT: ld.param.b64 %rd2, [retval0];
+; CHECK-NEXT: } // callseq 5
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
%ret = call i64 (ptr, ...) @callee_variadic(ptr %p, i64 7)
ret i64 %ret
}
diff --git a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
index 4d2ba7d..06fb8d2 100644
--- a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
@@ -22,8 +22,8 @@ define i32 @test_dynamic_stackalloc(i64 %n) {
; CHECK-32-NEXT: cvta.local.u32 %r5, %r4;
; CHECK-32-NEXT: { // callseq 0, 0
; CHECK-32-NEXT: .param .b32 param0;
-; CHECK-32-NEXT: st.param.b32 [param0], %r5;
; CHECK-32-NEXT: .param .b32 retval0;
+; CHECK-32-NEXT: st.param.b32 [param0], %r5;
; CHECK-32-NEXT: call.uni (retval0), bar, (param0);
; CHECK-32-NEXT: ld.param.b32 %r6, [retval0];
; CHECK-32-NEXT: } // callseq 0
@@ -43,8 +43,8 @@ define i32 @test_dynamic_stackalloc(i64 %n) {
; CHECK-64-NEXT: cvta.local.u64 %rd5, %rd4;
; CHECK-64-NEXT: { // callseq 0, 0
; CHECK-64-NEXT: .param .b64 param0;
-; CHECK-64-NEXT: st.param.b64 [param0], %rd5;
; CHECK-64-NEXT: .param .b32 retval0;
+; CHECK-64-NEXT: st.param.b64 [param0], %rd5;
; CHECK-64-NEXT: call.uni (retval0), bar, (param0);
; CHECK-64-NEXT: ld.param.b32 %r1, [retval0];
; CHECK-64-NEXT: } // callseq 0
diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index 8918fbd..d4fcea3 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -462,10 +462,10 @@ define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-NEXT: ld.param.b32 %r1, [test_call_param_0];
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.b32 [param0], %r1;
; CHECK-NEXT: .param .align 4 .b8 param1[4];
-; CHECK-NEXT: st.param.b32 [param1], %r2;
; CHECK-NEXT: .param .align 4 .b8 retval0[4];
+; CHECK-NEXT: st.param.b32 [param1], %r2;
+; CHECK-NEXT: st.param.b32 [param0], %r1;
; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1);
; CHECK-NEXT: ld.param.b32 %r3, [retval0];
; CHECK-NEXT: } // callseq 0
@@ -485,10 +485,10 @@ define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-NEXT: ld.param.b32 %r1, [test_call_flipped_param_0];
; CHECK-NEXT: { // callseq 1, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.b32 [param0], %r2;
; CHECK-NEXT: .param .align 4 .b8 param1[4];
-; CHECK-NEXT: st.param.b32 [param1], %r1;
; CHECK-NEXT: .param .align 4 .b8 retval0[4];
+; CHECK-NEXT: st.param.b32 [param1], %r1;
+; CHECK-NEXT: st.param.b32 [param0], %r2;
; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1);
; CHECK-NEXT: ld.param.b32 %r3, [retval0];
; CHECK-NEXT: } // callseq 1
@@ -508,10 +508,10 @@ define <2 x half> @test_tailcall_flipped(<2 x half> %a, <2 x half> %b) #0 {
; CHECK-NEXT: ld.param.b32 %r1, [test_tailcall_flipped_param_0];
; CHECK-NEXT: { // callseq 2, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.b32 [param0], %r2;
; CHECK-NEXT: .param .align 4 .b8 param1[4];
-; CHECK-NEXT: st.param.b32 [param1], %r1;
; CHECK-NEXT: .param .align 4 .b8 retval0[4];
+; CHECK-NEXT: st.param.b32 [param1], %r1;
+; CHECK-NEXT: st.param.b32 [param0], %r2;
; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1);
; CHECK-NEXT: ld.param.b32 %r3, [retval0];
; CHECK-NEXT: } // callseq 2
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
index 30afd69..b84a0ec 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -859,10 +859,10 @@ define <2 x float> @test_call(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NEXT: ld.param.b64 %rd1, [test_call_param_0];
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.b64 [param0], %rd1;
; CHECK-NEXT: .param .align 8 .b8 param1[8];
-; CHECK-NEXT: st.param.b64 [param1], %rd2;
; CHECK-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-NEXT: st.param.b64 [param1], %rd2;
+; CHECK-NEXT: st.param.b64 [param0], %rd1;
; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1);
; CHECK-NEXT: ld.param.b64 %rd3, [retval0];
; CHECK-NEXT: } // callseq 0
@@ -882,10 +882,10 @@ define <2 x float> @test_call_flipped(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NEXT: ld.param.b64 %rd1, [test_call_flipped_param_0];
; CHECK-NEXT: { // callseq 1, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.b64 [param0], %rd2;
; CHECK-NEXT: .param .align 8 .b8 param1[8];
-; CHECK-NEXT: st.param.b64 [param1], %rd1;
; CHECK-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-NEXT: st.param.b64 [param1], %rd1;
+; CHECK-NEXT: st.param.b64 [param0], %rd2;
; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1);
; CHECK-NEXT: ld.param.b64 %rd3, [retval0];
; CHECK-NEXT: } // callseq 1
@@ -905,10 +905,10 @@ define <2 x float> @test_tailcall_flipped(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NEXT: ld.param.b64 %rd1, [test_tailcall_flipped_param_0];
; CHECK-NEXT: { // callseq 2, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.b64 [param0], %rd2;
; CHECK-NEXT: .param .align 8 .b8 param1[8];
-; CHECK-NEXT: st.param.b64 [param1], %rd1;
; CHECK-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-NEXT: st.param.b64 [param1], %rd1;
+; CHECK-NEXT: st.param.b64 [param0], %rd2;
; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1);
; CHECK-NEXT: ld.param.b64 %rd3, [retval0];
; CHECK-NEXT: } // callseq 2
diff --git a/llvm/test/CodeGen/NVPTX/fma.ll b/llvm/test/CodeGen/NVPTX/fma.ll
index 5aa12b0..87274aa 100644
--- a/llvm/test/CodeGen/NVPTX/fma.ll
+++ b/llvm/test/CodeGen/NVPTX/fma.ll
@@ -36,10 +36,10 @@ define ptx_device float @t2_f32(float %x, float %y, float %z, float %w) {
; CHECK-NEXT: fma.rn.f32 %r6, %r1, %r2, %r5;
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .b32 param0;
-; CHECK-NEXT: st.param.b32 [param0], %r4;
; CHECK-NEXT: .param .b32 param1;
-; CHECK-NEXT: st.param.b32 [param1], %r6;
; CHECK-NEXT: .param .b32 retval0;
+; CHECK-NEXT: st.param.b32 [param1], %r6;
+; CHECK-NEXT: st.param.b32 [param0], %r4;
; CHECK-NEXT: call.uni (retval0), dummy_f32, (param0, param1);
; CHECK-NEXT: ld.param.b32 %r7, [retval0];
; CHECK-NEXT: } // callseq 0
@@ -83,10 +83,10 @@ define ptx_device double @t2_f64(double %x, double %y, double %z, double %w) {
; CHECK-NEXT: fma.rn.f64 %rd6, %rd1, %rd2, %rd5;
; CHECK-NEXT: { // callseq 1, 0
; CHECK-NEXT: .param .b64 param0;
-; CHECK-NEXT: st.param.b64 [param0], %rd4;
; CHECK-NEXT: .param .b64 param1;
-; CHECK-NEXT: st.param.b64 [param1], %rd6;
; CHECK-NEXT: .param .b64 retval0;
+; CHECK-NEXT: st.param.b64 [param1], %rd6;
+; CHECK-NEXT: st.param.b64 [param0], %rd4;
; CHECK-NEXT: call.uni (retval0), dummy_f64, (param0, param1);
; CHECK-NEXT: ld.param.b64 %rd7, [retval0];
; CHECK-NEXT: } // callseq 1
diff --git a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll
index ed8f6b4..636e12b 100644
--- a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll
+++ b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll
@@ -64,9 +64,9 @@ define void @test_ld_param_byval(ptr byval(i32) %a) {
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_ld_param_byval_param_0];
; CHECK-NEXT: { // callseq 1, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
+; CHECK-NEXT: ld.param.b32 %r1, [test_ld_param_byval_param_0];
; CHECK-NEXT: st.param.b32 [param0], %r1;
; CHECK-NEXT: call.uni byval_user, (param0);
; CHECK-NEXT: } // callseq 1
diff --git a/llvm/test/CodeGen/NVPTX/i128-param.ll b/llvm/test/CodeGen/NVPTX/i128-param.ll
index 4f4c2fe..79abca0 100644
--- a/llvm/test/CodeGen/NVPTX/i128-param.ll
+++ b/llvm/test/CodeGen/NVPTX/i128-param.ll
@@ -29,11 +29,11 @@ start:
; CHECK-DAG: ld.param.v2.b64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [caller_kernel_param_1];
; CHECK: { // callseq [[CALLSEQ_ID:[0-9]]], 0
- ; CHECK: .param .align 16 .b8 param0[16];
- ; CHECK-NEXT: st.param.v2.b64 [param0], {%[[REG0]], %[[REG1]]}
- ; CHECK: .param .align 16 .b8 param1[16];
- ; CHECK-NEXT: st.param.v2.b64 [param1], {%[[REG2]], %[[REG3]]}
- ; CHECK: } // callseq [[CALLSEQ_ID]]
+ ; CHECK-DAG: .param .align 16 .b8 param0[16];
+ ; CHECK-DAG: .param .align 16 .b8 param1[16];
+ ; CHECK-DAG: st.param.v2.b64 [param0], {%[[REG0]], %[[REG1]]}
+ ; CHECK-DAG: st.param.v2.b64 [param1], {%[[REG2]], %[[REG3]]}
+ ; CHECK: } // callseq [[CALLSEQ_ID]]
call void @callee(i128 %0, i128 %1, ptr %2)
ret void
@@ -48,11 +48,11 @@ start:
; CHECK-DAG: ld.param.v2.b64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [caller_func_param_1]
; CHECK: { // callseq [[CALLSEQ_ID:[0-9]]], 0
- ; CHECK: .param .align 16 .b8 param0[16];
- ; CHECK: st.param.v2.b64 [param0], {%[[REG0]], %[[REG1]]}
- ; CHECK: .param .align 16 .b8 param1[16];
- ; CHECK: st.param.v2.b64 [param1], {%[[REG2]], %[[REG3]]}
- ; CHECK: } // callseq [[CALLSEQ_ID]]
+ ; CHECK-DAG: .param .align 16 .b8 param0[16];
+ ; CHECK-DAG: .param .align 16 .b8 param1[16];
+ ; CHECK-DAG: st.param.v2.b64 [param0], {%[[REG0]], %[[REG1]]}
+ ; CHECK-DAG: st.param.v2.b64 [param1], {%[[REG2]], %[[REG3]]}
+ ; CHECK: } // callseq [[CALLSEQ_ID]]
call void @callee(i128 %0, i128 %1, ptr %2)
ret void
diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
index 2b7a06c..74136bb 100644
--- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
@@ -642,10 +642,10 @@ define <2 x i16> @test_call(<2 x i16> %a, <2 x i16> %b) #0 {
; COMMON-NEXT: ld.param.b32 %r1, [test_call_param_0];
; COMMON-NEXT: { // callseq 0, 0
; COMMON-NEXT: .param .align 4 .b8 param0[4];
-; COMMON-NEXT: st.param.b32 [param0], %r1;
; COMMON-NEXT: .param .align 4 .b8 param1[4];
-; COMMON-NEXT: st.param.b32 [param1], %r2;
; COMMON-NEXT: .param .align 4 .b8 retval0[4];
+; COMMON-NEXT: st.param.b32 [param1], %r2;
+; COMMON-NEXT: st.param.b32 [param0], %r1;
; COMMON-NEXT: call.uni (retval0), test_callee, (param0, param1);
; COMMON-NEXT: ld.param.b32 %r3, [retval0];
; COMMON-NEXT: } // callseq 0
@@ -665,10 +665,10 @@ define <2 x i16> @test_call_flipped(<2 x i16> %a, <2 x i16> %b) #0 {
; COMMON-NEXT: ld.param.b32 %r1, [test_call_flipped_param_0];
; COMMON-NEXT: { // callseq 1, 0
; COMMON-NEXT: .param .align 4 .b8 param0[4];
-; COMMON-NEXT: st.param.b32 [param0], %r2;
; COMMON-NEXT: .param .align 4 .b8 param1[4];
-; COMMON-NEXT: st.param.b32 [param1], %r1;
; COMMON-NEXT: .param .align 4 .b8 retval0[4];
+; COMMON-NEXT: st.param.b32 [param1], %r1;
+; COMMON-NEXT: st.param.b32 [param0], %r2;
; COMMON-NEXT: call.uni (retval0), test_callee, (param0, param1);
; COMMON-NEXT: ld.param.b32 %r3, [retval0];
; COMMON-NEXT: } // callseq 1
@@ -688,10 +688,10 @@ define <2 x i16> @test_tailcall_flipped(<2 x i16> %a, <2 x i16> %b) #0 {
; COMMON-NEXT: ld.param.b32 %r1, [test_tailcall_flipped_param_0];
; COMMON-NEXT: { // callseq 2, 0
; COMMON-NEXT: .param .align 4 .b8 param0[4];
-; COMMON-NEXT: st.param.b32 [param0], %r2;
; COMMON-NEXT: .param .align 4 .b8 param1[4];
-; COMMON-NEXT: st.param.b32 [param1], %r1;
; COMMON-NEXT: .param .align 4 .b8 retval0[4];
+; COMMON-NEXT: st.param.b32 [param1], %r1;
+; COMMON-NEXT: st.param.b32 [param0], %r2;
; COMMON-NEXT: call.uni (retval0), test_callee, (param0, param1);
; COMMON-NEXT: ld.param.b32 %r3, [retval0];
; COMMON-NEXT: } // callseq 2
diff --git a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll
index 3edd4e4..98f94bb 100644
--- a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll
@@ -1,42 +1,107 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 \
-; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
-; RUN: | FileCheck %s
-; RUN: %if ptxas %{ \
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -asm-verbose=false \
-; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
-; RUN: | %ptxas-verify -arch=sm_90 \
+; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx80 -disable-post-ra -frame-pointer=all \
+; RUN: -verify-machineinstrs -O0 | FileCheck %s --check-prefixes=O0,COMMON
+; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx80 -disable-post-ra -frame-pointer=all \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=O3,COMMON
+; RUN: %if ptxas %{ \
+; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx80 -disable-post-ra -frame-pointer=all \
+; RUN: -verify-machineinstrs -O0 \
+; RUN: | %ptxas-verify -arch=sm_90 \
+; RUN: %}
+; RUN: %if ptxas %{ \
+; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx80 -disable-post-ra -frame-pointer=all \
+; RUN: -verify-machineinstrs \
+; RUN: | %ptxas-verify -arch=sm_90 \
; RUN: %}
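+; llc is run at -O0 and at the default opt level; the FileCheck prefixes O0 and
+; O3 capture the per-level output, while COMMON covers lines shared by both.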
+target triple = "nvptx64-nvidia-cuda"
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
define i16 @test_bitcast_2xi8_i16(<2 x i8> %a) {
-; CHECK-LABEL: test_bitcast_2xi8_i16(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<5>;
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_bitcast_2xi8_i16_param_0];
-; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
-; CHECK-NEXT: shl.b16 %rs3, %rs2, 8;
-; CHECK-NEXT: or.b16 %rs4, %rs1, %rs3;
-; CHECK-NEXT: cvt.u32.u16 %r2, %rs4;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-NEXT: ret;
+; O0-LABEL: test_bitcast_2xi8_i16(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<5>;
+; O0-NEXT: .reg .b32 %r<3>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_bitcast_2xi8_i16_param_0];
+; O0-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; O0-NEXT: shl.b16 %rs3, %rs2, 8;
+; O0-NEXT: or.b16 %rs4, %rs1, %rs3;
+; O0-NEXT: cvt.u32.u16 %r2, %rs4;
+; O0-NEXT: st.param.b32 [func_retval0], %r2;
+; O0-NEXT: ret;
+;
+; O3-LABEL: test_bitcast_2xi8_i16(
+; O3: {
+; O3-NEXT: .reg .b32 %r<2>;
+; O3-EMPTY:
+; O3-NEXT: // %bb.0:
+; O3-NEXT: ld.param.b16 %r1, [test_bitcast_2xi8_i16_param_0];
+; O3-NEXT: st.param.b32 [func_retval0], %r1;
+; O3-NEXT: ret;
%res = bitcast <2 x i8> %a to i16
ret i16 %res
}
define <2 x i8> @test_bitcast_i16_2xi8(i16 %a) {
-; CHECK-LABEL: test_bitcast_i16_2xi8(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b16 %rs1, [test_bitcast_i16_2xi8_param_0];
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
-; CHECK-NEXT: ret;
+; O0-LABEL: test_bitcast_i16_2xi8(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.b16 %rs1, [test_bitcast_i16_2xi8_param_0];
+; O0-NEXT: st.param.b16 [func_retval0], %rs1;
+; O0-NEXT: ret;
+;
+; O3-LABEL: test_bitcast_i16_2xi8(
+; O3: {
+; O3-NEXT: .reg .b16 %rs<2>;
+; O3-EMPTY:
+; O3-NEXT: // %bb.0:
+; O3-NEXT: ld.param.b16 %rs1, [test_bitcast_i16_2xi8_param_0];
+; O3-NEXT: st.param.b16 [func_retval0], %rs1;
+; O3-NEXT: ret;
%res = bitcast i16 %a to <2 x i8>
ret <2 x i8> %res
}
+
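+; Round-trip a <2 x i8> value through a self-recursive call so that the
+; param/retval lowering for 2 x i8 is exercised at both O0 and O3.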
+define <2 x i8> @test_call_2xi8(<2 x i8> %a) {
+; O0-LABEL: test_call_2xi8(
+; O0: {
+; O0-NEXT: .reg .b16 %rs<7>;
+; O0-NEXT: .reg .b32 %r<2>;
+; O0-EMPTY:
+; O0-NEXT: // %bb.0:
+; O0-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_call_2xi8_param_0];
+; O0-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; O0-NEXT: { // callseq 0, 0
+; O0-NEXT: .param .align 2 .b8 param0[2];
+; O0-NEXT: .param .align 2 .b8 retval0[2];
+; O0-NEXT: st.param.v2.b8 [param0], {%rs1, %rs2};
+; O0-NEXT: call.uni (retval0), test_call_2xi8, (param0);
+; O0-NEXT: ld.param.v2.b8 {%rs3, %rs4}, [retval0];
+; O0-NEXT: } // callseq 0
+; O0-NEXT: st.param.v2.b8 [func_retval0], {%rs3, %rs4};
+; O0-NEXT: ret;
+;
+; O3-LABEL: test_call_2xi8(
+; O3: {
+; O3-NEXT: .reg .b16 %rs<7>;
+; O3-EMPTY:
+; O3-NEXT: // %bb.0:
+; O3-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_call_2xi8_param_0];
+; O3-NEXT: { // callseq 0, 0
+; O3-NEXT: .param .align 2 .b8 param0[2];
+; O3-NEXT: .param .align 2 .b8 retval0[2];
+; O3-NEXT: st.param.v2.b8 [param0], {%rs1, %rs2};
+; O3-NEXT: call.uni (retval0), test_call_2xi8, (param0);
+; O3-NEXT: ld.param.v2.b8 {%rs3, %rs4}, [retval0];
+; O3-NEXT: } // callseq 0
+; O3-NEXT: st.param.v2.b8 [func_retval0], {%rs3, %rs4};
+; O3-NEXT: ret;
+ %res = call <2 x i8> @test_call_2xi8(<2 x i8> %a)
+ ret <2 x i8> %res
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; COMMON: {{.*}}
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index da99cec..06c2cc8 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -1273,10 +1273,10 @@ define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 {
; O0-NEXT: ld.param.b32 %r1, [test_call_param_0];
; O0-NEXT: { // callseq 0, 0
; O0-NEXT: .param .align 4 .b8 param0[4];
-; O0-NEXT: st.param.b32 [param0], %r1;
; O0-NEXT: .param .align 4 .b8 param1[4];
-; O0-NEXT: st.param.b32 [param1], %r2;
; O0-NEXT: .param .align 4 .b8 retval0[4];
+; O0-NEXT: st.param.b32 [param1], %r2;
+; O0-NEXT: st.param.b32 [param0], %r1;
; O0-NEXT: call.uni (retval0), test_callee, (param0, param1);
; O0-NEXT: ld.param.b32 %r3, [retval0];
; O0-NEXT: } // callseq 0
@@ -1289,13 +1289,13 @@ define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 {
; O3-EMPTY:
; O3-NEXT: // %bb.0:
; O3-NEXT: ld.param.b32 %r1, [test_call_param_0];
-; O3-NEXT: ld.param.b32 %r2, [test_call_param_1];
; O3-NEXT: { // callseq 0, 0
; O3-NEXT: .param .align 4 .b8 param0[4];
-; O3-NEXT: st.param.b32 [param0], %r1;
; O3-NEXT: .param .align 4 .b8 param1[4];
-; O3-NEXT: st.param.b32 [param1], %r2;
; O3-NEXT: .param .align 4 .b8 retval0[4];
+; O3-NEXT: ld.param.b32 %r2, [test_call_param_1];
+; O3-NEXT: st.param.b32 [param1], %r2;
+; O3-NEXT: st.param.b32 [param0], %r1;
; O3-NEXT: call.uni (retval0), test_callee, (param0, param1);
; O3-NEXT: ld.param.b32 %r3, [retval0];
; O3-NEXT: } // callseq 0
@@ -1315,10 +1315,10 @@ define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 {
; O0-NEXT: ld.param.b32 %r1, [test_call_flipped_param_0];
; O0-NEXT: { // callseq 1, 0
; O0-NEXT: .param .align 4 .b8 param0[4];
-; O0-NEXT: st.param.b32 [param0], %r2;
; O0-NEXT: .param .align 4 .b8 param1[4];
-; O0-NEXT: st.param.b32 [param1], %r1;
; O0-NEXT: .param .align 4 .b8 retval0[4];
+; O0-NEXT: st.param.b32 [param1], %r1;
+; O0-NEXT: st.param.b32 [param0], %r2;
; O0-NEXT: call.uni (retval0), test_callee, (param0, param1);
; O0-NEXT: ld.param.b32 %r3, [retval0];
; O0-NEXT: } // callseq 1
@@ -1331,13 +1331,13 @@ define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 {
; O3-EMPTY:
; O3-NEXT: // %bb.0:
; O3-NEXT: ld.param.b32 %r1, [test_call_flipped_param_0];
-; O3-NEXT: ld.param.b32 %r2, [test_call_flipped_param_1];
; O3-NEXT: { // callseq 1, 0
; O3-NEXT: .param .align 4 .b8 param0[4];
-; O3-NEXT: st.param.b32 [param0], %r2;
; O3-NEXT: .param .align 4 .b8 param1[4];
-; O3-NEXT: st.param.b32 [param1], %r1;
; O3-NEXT: .param .align 4 .b8 retval0[4];
+; O3-NEXT: st.param.b32 [param1], %r1;
+; O3-NEXT: ld.param.b32 %r2, [test_call_flipped_param_1];
+; O3-NEXT: st.param.b32 [param0], %r2;
; O3-NEXT: call.uni (retval0), test_callee, (param0, param1);
; O3-NEXT: ld.param.b32 %r3, [retval0];
; O3-NEXT: } // callseq 1
@@ -1357,10 +1357,10 @@ define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 {
; O0-NEXT: ld.param.b32 %r1, [test_tailcall_flipped_param_0];
; O0-NEXT: { // callseq 2, 0
; O0-NEXT: .param .align 4 .b8 param0[4];
-; O0-NEXT: st.param.b32 [param0], %r2;
; O0-NEXT: .param .align 4 .b8 param1[4];
-; O0-NEXT: st.param.b32 [param1], %r1;
; O0-NEXT: .param .align 4 .b8 retval0[4];
+; O0-NEXT: st.param.b32 [param1], %r1;
+; O0-NEXT: st.param.b32 [param0], %r2;
; O0-NEXT: call.uni (retval0), test_callee, (param0, param1);
; O0-NEXT: ld.param.b32 %r3, [retval0];
; O0-NEXT: } // callseq 2
@@ -1373,13 +1373,13 @@ define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 {
; O3-EMPTY:
; O3-NEXT: // %bb.0:
; O3-NEXT: ld.param.b32 %r1, [test_tailcall_flipped_param_0];
-; O3-NEXT: ld.param.b32 %r2, [test_tailcall_flipped_param_1];
; O3-NEXT: { // callseq 2, 0
; O3-NEXT: .param .align 4 .b8 param0[4];
-; O3-NEXT: st.param.b32 [param0], %r2;
; O3-NEXT: .param .align 4 .b8 param1[4];
-; O3-NEXT: st.param.b32 [param1], %r1;
; O3-NEXT: .param .align 4 .b8 retval0[4];
+; O3-NEXT: st.param.b32 [param1], %r1;
+; O3-NEXT: ld.param.b32 %r2, [test_tailcall_flipped_param_1];
+; O3-NEXT: st.param.b32 [param0], %r2;
; O3-NEXT: call.uni (retval0), test_callee, (param0, param1);
; O3-NEXT: ld.param.b32 %r3, [retval0];
; O3-NEXT: } // callseq 2
diff --git a/llvm/test/CodeGen/NVPTX/idioms.ll b/llvm/test/CodeGen/NVPTX/idioms.ll
index be84f9b..a3bf892 100644
--- a/llvm/test/CodeGen/NVPTX/idioms.ll
+++ b/llvm/test/CodeGen/NVPTX/idioms.ll
@@ -173,8 +173,8 @@ define %struct.S16 @i32_to_2xi16_shr(i32 noundef %i){
; CHECK-NEXT: } // callseq 0
; CHECK-NEXT: shr.s32 %r2, %r1, 16;
; CHECK-NEXT: shr.u32 %r3, %r2, 16;
-; CHECK-NEXT: st.param.b16 [func_retval0], %r2;
; CHECK-NEXT: st.param.b16 [func_retval0+2], %r3;
+; CHECK-NEXT: st.param.b16 [func_retval0], %r2;
; CHECK-NEXT: ret;
call void @escape_int(i32 %i); // Force %i to be loaded completely.
%i1 = ashr i32 %i, 16
diff --git a/llvm/test/CodeGen/NVPTX/indirect_byval.ll b/llvm/test/CodeGen/NVPTX/indirect_byval.ll
index eae0321..782e672 100644
--- a/llvm/test/CodeGen/NVPTX/indirect_byval.ll
+++ b/llvm/test/CodeGen/NVPTX/indirect_byval.ll
@@ -23,15 +23,15 @@ define internal i32 @foo() {
; CHECK-NEXT: mov.b64 %SPL, __local_depot0;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.global.b64 %rd1, [ptr];
-; CHECK-NEXT: add.u64 %rd3, %SPL, 1;
-; CHECK-NEXT: ld.local.b8 %rs1, [%rd3];
-; CHECK-NEXT: add.u64 %rd4, %SP, 0;
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 1 .b8 param0[1];
-; CHECK-NEXT: st.param.b8 [param0], %rs1;
; CHECK-NEXT: .param .b64 param1;
-; CHECK-NEXT: st.param.b64 [param1], %rd4;
; CHECK-NEXT: .param .b32 retval0;
+; CHECK-NEXT: add.u64 %rd2, %SP, 0;
+; CHECK-NEXT: st.param.b64 [param1], %rd2;
+; CHECK-NEXT: add.u64 %rd4, %SPL, 1;
+; CHECK-NEXT: ld.local.b8 %rs1, [%rd4];
+; CHECK-NEXT: st.param.b8 [param0], %rs1;
; CHECK-NEXT: prototype_0 : .callprototype (.param .b32 _) _ (.param .align 1 .b8 _[1], .param .b64 _);
; CHECK-NEXT: call (retval0), %rd1, (param0, param1), prototype_0;
; CHECK-NEXT: ld.param.b32 %r1, [retval0];
@@ -60,15 +60,15 @@ define internal i32 @bar() {
; CHECK-NEXT: mov.b64 %SPL, __local_depot1;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.global.b64 %rd1, [ptr];
-; CHECK-NEXT: add.u64 %rd3, %SPL, 8;
-; CHECK-NEXT: ld.local.b64 %rd4, [%rd3];
-; CHECK-NEXT: add.u64 %rd5, %SP, 0;
; CHECK-NEXT: { // callseq 1, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.b64 [param0], %rd4;
; CHECK-NEXT: .param .b64 param1;
-; CHECK-NEXT: st.param.b64 [param1], %rd5;
; CHECK-NEXT: .param .b32 retval0;
+; CHECK-NEXT: add.u64 %rd2, %SP, 0;
+; CHECK-NEXT: st.param.b64 [param1], %rd2;
+; CHECK-NEXT: add.u64 %rd4, %SPL, 8;
+; CHECK-NEXT: ld.local.b64 %rd5, [%rd4];
+; CHECK-NEXT: st.param.b64 [param0], %rd5;
; CHECK-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .align 8 .b8 _[8], .param .b64 _);
; CHECK-NEXT: call (retval0), %rd1, (param0, param1), prototype_1;
; CHECK-NEXT: ld.param.b32 %r1, [retval0];
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index 321a624..38185c7b 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -121,20 +121,18 @@ define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, p
define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
; PTX-LABEL: grid_const_escape(
; PTX: {
-; PTX-NEXT: .reg .b32 %r<2>;
; PTX-NEXT: .reg .b64 %rd<4>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd2, grid_const_escape_param_0;
; PTX-NEXT: cvta.param.u64 %rd3, %rd2;
-; PTX-NEXT: mov.b64 %rd1, escape;
; PTX-NEXT: { // callseq 0, 0
; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0], %rd3;
; PTX-NEXT: .param .b32 retval0;
+; PTX-NEXT: st.param.b64 [param0], %rd3;
; PTX-NEXT: prototype_0 : .callprototype (.param .b32 _) _ (.param .b64 _);
+; PTX-NEXT: mov.b64 %rd1, escape;
; PTX-NEXT: call (retval0), %rd1, (param0), prototype_0;
-; PTX-NEXT: ld.param.b32 %r1, [retval0];
; PTX-NEXT: } // callseq 0
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_escape(
@@ -153,7 +151,7 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4
; PTX-NEXT: .local .align 4 .b8 __local_depot4[4];
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
-; PTX-NEXT: .reg .b32 %r<3>;
+; PTX-NEXT: .reg .b32 %r<2>;
; PTX-NEXT: .reg .b64 %rd<8>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
@@ -167,18 +165,17 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4
; PTX-NEXT: add.u64 %rd6, %SP, 0;
; PTX-NEXT: add.u64 %rd7, %SPL, 0;
; PTX-NEXT: st.local.b32 [%rd7], %r1;
-; PTX-NEXT: mov.b64 %rd1, escape3;
; PTX-NEXT: { // callseq 1, 0
; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0], %rd5;
; PTX-NEXT: .param .b64 param1;
-; PTX-NEXT: st.param.b64 [param1], %rd6;
; PTX-NEXT: .param .b64 param2;
-; PTX-NEXT: st.param.b64 [param2], %rd4;
; PTX-NEXT: .param .b32 retval0;
+; PTX-NEXT: st.param.b64 [param2], %rd4;
+; PTX-NEXT: st.param.b64 [param1], %rd6;
+; PTX-NEXT: st.param.b64 [param0], %rd5;
; PTX-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .b64 _, .param .b64 _, .param .b64 _);
+; PTX-NEXT: mov.b64 %rd1, escape3;
; PTX-NEXT: call (retval0), %rd1, (param0, param1, param2), prototype_1;
-; PTX-NEXT: ld.param.b32 %r2, [retval0];
; PTX-NEXT: } // callseq 1
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @multiple_grid_const_escape(
@@ -255,7 +252,7 @@ define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4
define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
; PTX-LABEL: grid_const_partial_escape(
; PTX: {
-; PTX-NEXT: .reg .b32 %r<4>;
+; PTX-NEXT: .reg .b32 %r<3>;
; PTX-NEXT: .reg .b64 %rd<6>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
@@ -266,14 +263,13 @@ define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %ou
; PTX-NEXT: ld.param.b32 %r1, [grid_const_partial_escape_param_0];
; PTX-NEXT: add.s32 %r2, %r1, %r1;
; PTX-NEXT: st.global.b32 [%rd4], %r2;
-; PTX-NEXT: mov.b64 %rd1, escape;
; PTX-NEXT: { // callseq 2, 0
; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0], %rd5;
; PTX-NEXT: .param .b32 retval0;
+; PTX-NEXT: st.param.b64 [param0], %rd5;
; PTX-NEXT: prototype_2 : .callprototype (.param .b32 _) _ (.param .b64 _);
+; PTX-NEXT: mov.b64 %rd1, escape;
; PTX-NEXT: call (retval0), %rd1, (param0), prototype_2;
-; PTX-NEXT: ld.param.b32 %r3, [retval0];
; PTX-NEXT: } // callseq 2
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_partial_escape(
@@ -295,7 +291,7 @@ define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %ou
define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) {
; PTX-LABEL: grid_const_partial_escapemem(
; PTX: {
-; PTX-NEXT: .reg .b32 %r<5>;
+; PTX-NEXT: .reg .b32 %r<4>;
; PTX-NEXT: .reg .b64 %rd<6>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
@@ -307,14 +303,13 @@ define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input,
; PTX-NEXT: ld.param.b32 %r2, [grid_const_partial_escapemem_param_0+4];
; PTX-NEXT: st.global.b64 [%rd4], %rd5;
; PTX-NEXT: add.s32 %r3, %r1, %r2;
-; PTX-NEXT: mov.b64 %rd1, escape;
; PTX-NEXT: { // callseq 3, 0
; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0], %rd5;
; PTX-NEXT: .param .b32 retval0;
+; PTX-NEXT: st.param.b64 [param0], %rd5;
; PTX-NEXT: prototype_3 : .callprototype (.param .b32 _) _ (.param .b64 _);
+; PTX-NEXT: mov.b64 %rd1, escape;
; PTX-NEXT: call (retval0), %rd1, (param0), prototype_3;
-; PTX-NEXT: ld.param.b32 %r4, [retval0];
; PTX-NEXT: } // callseq 3
; PTX-NEXT: st.param.b32 [func_retval0], %r3;
; PTX-NEXT: ret;
@@ -535,9 +530,9 @@ define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) {
; PTX-NEXT: .reg .b32 %r<2>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
-; PTX-NEXT: ld.param.b32 %r1, [test_forward_byval_arg_param_0];
; PTX-NEXT: { // callseq 4, 0
; PTX-NEXT: .param .align 4 .b8 param0[4];
+; PTX-NEXT: ld.param.b32 %r1, [test_forward_byval_arg_param_0];
; PTX-NEXT: st.param.b32 [param0], %r1;
; PTX-NEXT: call.uni device_func, (param0);
; PTX-NEXT: } // callseq 4
diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
index c165de7..7c029ab 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -31,7 +31,7 @@ define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 %
; PTX-LABEL: load_alignment(
; PTX: {
; PTX-NEXT: .reg .b32 %r<4>;
-; PTX-NEXT: .reg .b64 %rd<7>;
+; PTX-NEXT: .reg .b64 %rd<6>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0: // %entry
; PTX-NEXT: mov.b64 %rd1, load_alignment_param_0;
@@ -45,10 +45,9 @@ define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 %
; PTX-NEXT: st.b32 [%rd3], %r3;
; PTX-NEXT: { // callseq 0, 0
; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0], %rd5;
; PTX-NEXT: .param .b64 retval0;
+; PTX-NEXT: st.param.b64 [param0], %rd5;
; PTX-NEXT: call.uni (retval0), escape, (param0);
-; PTX-NEXT: ld.param.b64 %rd6, [retval0];
; PTX-NEXT: } // callseq 0
; PTX-NEXT: ret;
entry:
@@ -76,17 +75,16 @@ define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) {
;
; PTX-LABEL: load_padding(
; PTX: {
-; PTX-NEXT: .reg .b64 %rd<4>;
+; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd1, load_padding_param_0;
; PTX-NEXT: cvta.local.u64 %rd2, %rd1;
; PTX-NEXT: { // callseq 1, 0
; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0], %rd2;
; PTX-NEXT: .param .b64 retval0;
+; PTX-NEXT: st.param.b64 [param0], %rd2;
; PTX-NEXT: call.uni (retval0), escape, (param0);
-; PTX-NEXT: ld.param.b64 %rd3, [retval0];
; PTX-NEXT: } // callseq 1
; PTX-NEXT: ret;
%tmp = call ptr @escape(ptr nonnull align 16 %arg)
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index 4784d70..20a3519 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -911,9 +911,9 @@ define void @device_func(ptr byval(i32) align 4 %input) {
; PTX-NEXT: .reg .b64 %rd<2>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
-; PTX-NEXT: ld.param.b32 %r1, [device_func_param_0];
; PTX-NEXT: { // callseq 3, 0
; PTX-NEXT: .param .align 4 .b8 param0[4];
+; PTX-NEXT: ld.param.b32 %r1, [device_func_param_0];
; PTX-NEXT: st.param.b32 [param0], %r1;
; PTX-NEXT: call.uni device_func, (param0);
; PTX-NEXT: } // callseq 3
diff --git a/llvm/test/CodeGen/NVPTX/misched_func_call.ll b/llvm/test/CodeGen/NVPTX/misched_func_call.ll
index 8401f45..b2994c0 100644
--- a/llvm/test/CodeGen/NVPTX/misched_func_call.ll
+++ b/llvm/test/CodeGen/NVPTX/misched_func_call.ll
@@ -8,7 +8,7 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) {
; CHECK-LABEL: wombat(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<11>;
-; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %bb
; CHECK-NEXT: ld.param.b32 %r4, [wombat_param_2];
@@ -19,19 +19,18 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) {
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .b64 param0;
-; CHECK-NEXT: st.param.b64 [param0], 0d0000000000000000;
; CHECK-NEXT: .param .b64 retval0;
+; CHECK-NEXT: st.param.b64 [param0], 0;
; CHECK-NEXT: call.uni (retval0), quux, (param0);
-; CHECK-NEXT: ld.param.b64 %rd1, [retval0];
; CHECK-NEXT: } // callseq 0
; CHECK-NEXT: mul.lo.s32 %r7, %r10, %r3;
; CHECK-NEXT: or.b32 %r8, %r4, %r7;
; CHECK-NEXT: mul.lo.s32 %r9, %r2, %r8;
-; CHECK-NEXT: cvt.rn.f64.s32 %rd2, %r9;
-; CHECK-NEXT: cvt.rn.f64.u32 %rd3, %r10;
-; CHECK-NEXT: add.rn.f64 %rd4, %rd3, %rd2;
-; CHECK-NEXT: mov.b64 %rd5, 0;
-; CHECK-NEXT: st.global.b64 [%rd5], %rd4;
+; CHECK-NEXT: cvt.rn.f64.s32 %rd1, %r9;
+; CHECK-NEXT: cvt.rn.f64.u32 %rd2, %r10;
+; CHECK-NEXT: add.rn.f64 %rd3, %rd2, %rd1;
+; CHECK-NEXT: mov.b64 %rd4, 0;
+; CHECK-NEXT: st.global.b64 [%rd4], %rd3;
; CHECK-NEXT: mov.b32 %r10, 1;
; CHECK-NEXT: bra.uni $L__BB0_1;
bb:
diff --git a/llvm/test/CodeGen/NVPTX/param-add.ll b/llvm/test/CodeGen/NVPTX/param-add.ll
index 4fa1235..c5ea9f8 100644
--- a/llvm/test/CodeGen/NVPTX/param-add.ll
+++ b/llvm/test/CodeGen/NVPTX/param-add.ll
@@ -18,16 +18,16 @@ define i32 @test(%struct.1float alignstack(32) %data) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_param_0];
-; CHECK-NEXT: shr.u32 %r2, %r1, 8;
-; CHECK-NEXT: shr.u32 %r3, %r1, 16;
-; CHECK-NEXT: shr.u32 %r4, %r1, 24;
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 1 .b8 param0[4];
+; CHECK-NEXT: .param .b32 retval0;
; CHECK-NEXT: st.param.b8 [param0], %r1;
+; CHECK-NEXT: shr.u32 %r2, %r1, 8;
; CHECK-NEXT: st.param.b8 [param0+1], %r2;
+; CHECK-NEXT: shr.u32 %r3, %r1, 16;
; CHECK-NEXT: st.param.b8 [param0+2], %r3;
+; CHECK-NEXT: shr.u32 %r4, %r3, 8;
; CHECK-NEXT: st.param.b8 [param0+3], %r4;
-; CHECK-NEXT: .param .b32 retval0;
; CHECK-NEXT: call.uni (retval0), callee, (param0);
; CHECK-NEXT: ld.param.b32 %r5, [retval0];
; CHECK-NEXT: } // callseq 0
diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll
index 6c52bfd..db3fbbc 100644
--- a/llvm/test/CodeGen/NVPTX/param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll
@@ -27,10 +27,10 @@
; CHECK: ld.param.b8 [[A8:%rs[0-9]+]], [test_i1_param_0];
; CHECK: and.b16 [[A:%rs[0-9]+]], [[A8]], 1;
; CHECK: setp.ne.b16 %p1, [[A]], 0
+; CHECK-DAG: .param .b32 param0;
+; CHECK-DAG: .param .b32 retval0;
; CHECK: cvt.u32.u16 [[B:%r[0-9]+]], [[A8]]
-; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], [[B]]
-; CHECK: .param .b32 retval0;
+; CHECK-DAG: st.param.b32 [param0], [[B]]
; CHECK: call.uni (retval0), test_i1,
; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0];
; CHECK: st.param.b32 [func_retval0], [[R8]];
@@ -47,11 +47,11 @@ define i1 @test_i1(i1 %a) {
; CHECK-NEXT: .param .b32 test_i1s_param_0
; CHECK: ld.param.b8 [[A8:%rs[0-9]+]], [test_i1s_param_0];
; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]];
+; CHECK: .param .b32 param0;
+; CHECK: .param .b32 retval0;
; CHECK: and.b32 [[A1:%r[0-9]+]], [[A32]], 1;
; CHECK: neg.s32 [[A:%r[0-9]+]], [[A1]];
-; CHECK: .param .b32 param0;
; CHECK: st.param.b32 [param0], [[A]];
-; CHECK: .param .b32 retval0;
; CHECK: call.uni
; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0];
; CHECK: and.b32 [[R1:%r[0-9]+]], [[R8]], 1;
@@ -70,9 +70,9 @@ define signext i1 @test_i1s(i1 signext %a) {
; CHECK-DAG: ld.param.b8 [[E2:%rs[0-9]+]], [test_v3i1_param_0+2];
; CHECK-DAG: ld.param.b8 [[E0:%rs[0-9]+]], [test_v3i1_param_0]
; CHECK: .param .align 1 .b8 param0[1];
+; CHECK: .param .align 1 .b8 retval0[1];
; CHECK-DAG: st.param.b8 [param0], [[E0]];
; CHECK-DAG: st.param.b8 [param0+2], [[E2]];
-; CHECK: .param .align 1 .b8 retval0[1];
; CHECK: call.uni (retval0), test_v3i1,
; CHECK-DAG: ld.param.b8 [[RE0:%rs[0-9]+]], [retval0];
; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2];
@@ -89,8 +89,8 @@ define <3 x i1> @test_v3i1(<3 x i1> %a) {
; CHECK-NEXT: .param .align 1 .b8 test_v4i1_param_0[1]
; CHECK: ld.param.b8 [[E0:%rs[0-9]+]], [test_v4i1_param_0]
; CHECK: .param .align 1 .b8 param0[1];
-; CHECK: st.param.b8 [param0], [[E0]];
; CHECK: .param .align 1 .b8 retval0[1];
+; CHECK: st.param.b8 [param0], [[E0]];
; CHECK: call.uni (retval0), test_v4i1,
; CHECK: ld.param.b8 [[RE0:%rs[0-9]+]], [retval0];
; CHECK: ld.param.b8 [[RE1:%rs[0-9]+]], [retval0+1];
@@ -112,9 +112,9 @@ define <4 x i1> @test_v4i1(<4 x i1> %a) {
; CHECK-DAG: ld.param.b8 [[E4:%rs[0-9]+]], [test_v5i1_param_0+4];
; CHECK-DAG: ld.param.b8 [[E0:%rs[0-9]+]], [test_v5i1_param_0]
; CHECK: .param .align 1 .b8 param0[1];
+; CHECK: .param .align 1 .b8 retval0[1];
; CHECK-DAG: st.param.b8 [param0], [[E0]];
; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
-; CHECK: .param .align 1 .b8 retval0[1];
; CHECK: call.uni (retval0), test_v5i1,
; CHECK-DAG: ld.param.b8 [[RE0:%rs[0-9]+]], [retval0];
; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4];
@@ -131,8 +131,8 @@ define <5 x i1> @test_v5i1(<5 x i1> %a) {
; CHECK-NEXT: .param .b32 test_i2_param_0
; CHECK: ld.param.b8 {{%rs[0-9]+}}, [test_i2_param_0];
; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: .param .b32 retval0;
+; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: call.uni (retval0), test_i2,
; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0];
; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}};
@@ -147,8 +147,8 @@ define i2 @test_i2(i2 %a) {
; CHECK-NEXT: .param .b32 test_i3_param_0
; CHECK: ld.param.b8 {{%rs[0-9]+}}, [test_i3_param_0];
; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: .param .b32 retval0;
+; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: call.uni (retval0), test_i3,
; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0];
; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}};
@@ -163,10 +163,10 @@ define i3 @test_i3(i3 %a) {
; CHECK-LABEL: test_i8(
; CHECK-NEXT: .param .b32 test_i8_param_0
; CHECK: ld.param.b8 [[A8:%rs[0-9]+]], [test_i8_param_0];
-; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]];
; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], [[A32]];
; CHECK: .param .b32 retval0;
+; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]];
+; CHECK: st.param.b32 [param0], [[A32]];
; CHECK: call.uni (retval0), test_i8,
; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0];
; CHECK: st.param.b32 [func_retval0], [[R32]];
@@ -181,10 +181,10 @@ define i8 @test_i8(i8 %a) {
; CHECK-LABEL: test_i8s(
; CHECK-NEXT: .param .b32 test_i8s_param_0
; CHECK: ld.param.s8 [[A8:%rs[0-9]+]], [test_i8s_param_0];
-; CHECK: cvt.s32.s16 [[A:%r[0-9]+]], [[A8]];
; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], [[A]];
; CHECK: .param .b32 retval0;
+; CHECK: cvt.s32.s16 [[A:%r[0-9]+]], [[A8]];
+; CHECK: st.param.b32 [param0], [[A]];
; CHECK: call.uni (retval0), test_i8s,
; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0];
; -- This is suspicious (though correct) -- why not cvt.u8.u32, cvt.s8.s32 ?
@@ -202,8 +202,8 @@ define signext i8 @test_i8s(i8 signext %a) {
; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4]
; CHECK: ld.param.b32 [[R:%r[0-9]+]], [test_v3i8_param_0];
; CHECK: .param .align 4 .b8 param0[4];
-; CHECK: st.param.b32 [param0], [[R]]
; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: st.param.b32 [param0], [[R]]
; CHECK: call.uni (retval0), test_v3i8,
; CHECK: ld.param.b32 [[RE:%r[0-9]+]], [retval0];
; v4i8/i32->{v3i8 elements}->v4i8/i32 conversion is messy and not very
@@ -220,8 +220,8 @@ define <3 x i8> @test_v3i8(<3 x i8> %a) {
; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4]
; CHECK: ld.param.b32 [[R:%r[0-9]+]], [test_v4i8_param_0]
; CHECK: .param .align 4 .b8 param0[4];
-; CHECK: st.param.b32 [param0], [[R]];
; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: st.param.b32 [param0], [[R]];
; CHECK: call.uni (retval0), test_v4i8,
; CHECK: ld.param.b32 [[RET:%r[0-9]+]], [retval0];
; CHECK: st.param.b32 [func_retval0], [[RET]];
@@ -237,20 +237,13 @@ define <4 x i8> @test_v4i8(<4 x i8> %a) {
; CHECK-DAG: ld.param.b32 [[E0:%r[0-9]+]], [test_v5i8_param_0]
; CHECK-DAG: ld.param.b8 [[E4:%rs[0-9]+]], [test_v5i8_param_0+4];
; CHECK: .param .align 8 .b8 param0[8];
-; CHECK-DAG: st.param.v4.b8 [param0],
-; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK-DAG: st.param.b32 [param0], [[E0]];
+; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
; CHECK: call.uni (retval0), test_v5i8,
-; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0];
+; CHECK-DAG: ld.param.b32 [[RE0:%r[0-9]+]], [retval0];
; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4];
-; CHECK-DAG: cvt.u32.u16 [[R3:%r[0-9]+]], [[RE3]];
-; CHECK-DAG: cvt.u32.u16 [[R2:%r[0-9]+]], [[RE2]];
-; CHECK-DAG: prmt.b32 [[P0:%r[0-9]+]], [[R2]], [[R3]], 0x3340U;
-; CHECK-DAG: cvt.u32.u16 [[R1:%r[0-9]+]], [[RE1]];
-; CHECK-DAG: cvt.u32.u16 [[R0:%r[0-9]+]], [[RE0]];
-; CHECK-DAG: prmt.b32 [[P1:%r[0-9]+]], [[R0]], [[R1]], 0x3340U;
-; CHECK-DAG: prmt.b32 [[P2:%r[0-9]+]], [[P1]], [[P0]], 0x5410U;
-; CHECK-DAG: st.param.b32 [func_retval0], [[P2]];
+; CHECK-DAG: st.param.b32 [func_retval0], [[RE0]];
; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]];
; CHECK-NEXT: ret;
define <5 x i8> @test_v5i8(<5 x i8> %a) {
@@ -262,8 +255,8 @@ define <5 x i8> @test_v5i8(<5 x i8> %a) {
; CHECK-LABEL: test_i11(
; CHECK-NEXT: .param .b32 test_i11_param_0
; CHECK: ld.param.b16 {{%rs[0-9]+}}, [test_i11_param_0];
-; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: .param .b32 retval0;
+; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: call.uni (retval0), test_i11,
; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0];
; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}};
@@ -277,10 +270,10 @@ define i11 @test_i11(i11 %a) {
; CHECK-LABEL: test_i16(
; CHECK-NEXT: .param .b32 test_i16_param_0
; CHECK: ld.param.b16 [[E16:%rs[0-9]+]], [test_i16_param_0];
-; CHECK: cvt.u32.u16 [[E32:%r[0-9]+]], [[E16]];
; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], [[E32]];
; CHECK: .param .b32 retval0;
+; CHECK: cvt.u32.u16 [[E32:%r[0-9]+]], [[E16]];
+; CHECK: st.param.b32 [param0], [[E32]];
; CHECK: call.uni (retval0), test_i16,
; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0];
; CHECK: st.param.b32 [func_retval0], [[RE32]];
@@ -294,10 +287,10 @@ define i16 @test_i16(i16 %a) {
; CHECK-LABEL: test_i16s(
; CHECK-NEXT: .param .b32 test_i16s_param_0
; CHECK: ld.param.b16 [[E16:%rs[0-9]+]], [test_i16s_param_0];
-; CHECK: cvt.s32.s16 [[E32:%r[0-9]+]], [[E16]];
; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], [[E32]];
; CHECK: .param .b32 retval0;
+; CHECK: cvt.s32.s16 [[E32:%r[0-9]+]], [[E16]];
+; CHECK: st.param.b32 [param0], [[E32]];
; CHECK: call.uni (retval0), test_i16s,
; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0];
; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[RE32]];
@@ -312,14 +305,15 @@ define signext i16 @test_i16s(i16 signext %a) {
; CHECK-LABEL: test_v3i16(
; CHECK-NEXT: .param .align 8 .b8 test_v3i16_param_0[8]
; CHECK-DAG: ld.param.b16 [[E2:%rs[0-9]+]], [test_v3i16_param_0+4];
-; CHECK-DAG: ld.param.v2.b16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i16_param_0];
+; CHECK-DAG: ld.param.b32 [[E0:%r[0-9]+]], [test_v3i16_param_0];
; CHECK: .param .align 8 .b8 param0[8];
-; CHECK: st.param.v2.b16 [param0], {[[E0]], [[E1]]};
-; CHECK: st.param.b16 [param0+4], [[E2]];
; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK-DAG: st.param.b32 [param0], [[E0]];
+; CHECK-DAG: st.param.b16 [param0+4], [[E2]];
; CHECK: call.uni (retval0), test_v3i16,
-; CHECK: ld.param.v2.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0];
+; CHECK: ld.param.b32 [[RE:%r[0-9]+]], [retval0];
; CHECK: ld.param.b16 [[RE2:%rs[0-9]+]], [retval0+4];
+; CHECK-DAG: mov.b32 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [[RE]];
; CHECK-DAG: st.param.v2.b16 [func_retval0], {[[RE0]], [[RE1]]};
; CHECK-DAG: st.param.b16 [func_retval0+4], [[RE2]];
; CHECK-NEXT: ret;
@@ -333,8 +327,8 @@ define <3 x i16> @test_v3i16(<3 x i16> %a) {
; CHECK-NEXT: .param .align 8 .b8 test_v4i16_param_0[8]
; CHECK: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v4i16_param_0]
; CHECK: .param .align 8 .b8 param0[8];
-; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
; CHECK: call.uni (retval0), test_v4i16,
; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0];
; CHECK: st.param.v2.b32 [func_retval0], {[[RE0]], [[RE1]]}
@@ -348,15 +342,15 @@ define <4 x i16> @test_v4i16(<4 x i16> %a) {
; CHECK-LABEL: test_v5i16(
; CHECK-NEXT: .param .align 16 .b8 test_v5i16_param_0[16]
; CHECK-DAG: ld.param.b16 [[E4:%rs[0-9]+]], [test_v5i16_param_0+8];
-; CHECK-DAG: ld.param.v4.b16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0]
+; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v5i16_param_0]
; CHECK: .param .align 16 .b8 param0[16];
-; CHECK-DAG: st.param.v4.b16 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]};
-; CHECK-DAG: st.param.b16 [param0+8], [[E4]];
; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK-DAG: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b16 [param0+8], [[E4]];
; CHECK: call.uni (retval0), test_v5i16,
-; CHECK-DAG: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0];
+; CHECK-DAG: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0];
; CHECK-DAG: ld.param.b16 [[RE4:%rs[0-9]+]], [retval0+8];
-; CHECK-DAG: st.param.v4.b16 [func_retval0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[RE0]], [[RE1]]}
; CHECK-DAG: st.param.b16 [func_retval0+8], [[RE4]];
; CHECK-NEXT: ret;
define <5 x i16> @test_v5i16(<5 x i16> %a) {
@@ -369,8 +363,8 @@ define <5 x i16> @test_v5i16(<5 x i16> %a) {
; CHECK-NEXT: .param .align 2 .b8 test_f16_param_0[2]
; CHECK: ld.param.b16 [[E:%rs[0-9]+]], [test_f16_param_0];
; CHECK: .param .align 2 .b8 param0[2];
-; CHECK: st.param.b16 [param0], [[E]];
; CHECK: .param .align 2 .b8 retval0[2];
+; CHECK: st.param.b16 [param0], [[E]];
; CHECK: call.uni (retval0), test_f16,
; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0];
; CHECK: st.param.b16 [func_retval0], [[R]]
@@ -385,8 +379,8 @@ define half @test_f16(half %a) {
; CHECK-NEXT: .param .align 4 .b8 test_v2f16_param_0[4]
; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_v2f16_param_0];
; CHECK: .param .align 4 .b8 param0[4];
-; CHECK: st.param.b32 [param0], [[E]];
; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: st.param.b32 [param0], [[E]];
; CHECK: call.uni (retval0), test_v2f16,
; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0];
; CHECK: st.param.b32 [func_retval0], [[R]]
@@ -401,8 +395,8 @@ define <2 x half> @test_v2f16(<2 x half> %a) {
; CHECK-NEXT: .param .align 2 .b8 test_bf16_param_0[2]
; CHECK: ld.param.b16 [[E:%rs[0-9]+]], [test_bf16_param_0];
; CHECK: .param .align 2 .b8 param0[2];
-; CHECK: st.param.b16 [param0], [[E]];
; CHECK: .param .align 2 .b8 retval0[2];
+; CHECK: st.param.b16 [param0], [[E]];
; CHECK: call.uni (retval0), test_bf16,
; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0];
; CHECK: st.param.b16 [func_retval0], [[R]]
@@ -417,8 +411,8 @@ define bfloat @test_bf16(bfloat %a) {
; CHECK-NEXT: .param .align 4 .b8 test_v2bf16_param_0[4]
; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_v2bf16_param_0];
; CHECK: .param .align 4 .b8 param0[4];
-; CHECK: st.param.b32 [param0], [[E]];
; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: st.param.b32 [param0], [[E]];
; CHECK: call.uni (retval0), test_v2bf16,
; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0];
; CHECK: st.param.b32 [func_retval0], [[R]]
@@ -432,15 +426,16 @@ define <2 x bfloat> @test_v2bf16(<2 x bfloat> %a) {
; CHECK:.func (.param .align 8 .b8 func_retval0[8])
; CHECK-LABEL: test_v3f16(
; CHECK: .param .align 8 .b8 test_v3f16_param_0[8]
-; CHECK-DAG: ld.param.v2.b16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3f16_param_0];
+; CHECK-DAG: ld.param.b32 [[E0:%r[0-9]+]], [test_v3f16_param_0];
; CHECK-DAG: ld.param.b16 [[E2:%rs[0-9]+]], [test_v3f16_param_0+4];
; CHECK: .param .align 8 .b8 param0[8];
-; CHECK-DAG: st.param.v2.b16 [param0], {[[E0]], [[E1]]};
-; CHECK-DAG: st.param.b16 [param0+4], [[E2]];
; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK-DAG: st.param.b32 [param0], [[E0]];
+; CHECK-DAG: st.param.b16 [param0+4], [[E2]];
; CHECK: call.uni (retval0), test_v3f16,
-; CHECK-DAG: ld.param.v2.b16 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]]}, [retval0];
+; CHECK-DAG: ld.param.b32 [[R:%r[0-9]+]], [retval0];
; CHECK-DAG: ld.param.b16 [[R2:%rs[0-9]+]], [retval0+4];
+; CHECK-DAG: mov.b32 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]]}, [[R]];
; CHECK-DAG: st.param.v2.b16 [func_retval0], {[[R0]], [[R1]]};
; CHECK-DAG: st.param.b16 [func_retval0+4], [[R2]];
; CHECK: ret;
@@ -454,8 +449,8 @@ define <3 x half> @test_v3f16(<3 x half> %a) {
; CHECK: .param .align 8 .b8 test_v4f16_param_0[8]
; CHECK: ld.param.v2.b32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]]}, [test_v4f16_param_0];
; CHECK: .param .align 8 .b8 param0[8];
-; CHECK: st.param.v2.b32 [param0], {[[R01]], [[R23]]};
; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: st.param.v2.b32 [param0], {[[R01]], [[R23]]};
; CHECK: call.uni (retval0), test_v4f16,
; CHECK: ld.param.v2.b32 {[[RH01:%r[0-9]+]], [[RH23:%r[0-9]+]]}, [retval0];
; CHECK: st.param.v2.b32 [func_retval0], {[[RH01]], [[RH23]]};
@@ -468,16 +463,16 @@ define <4 x half> @test_v4f16(<4 x half> %a) {
; CHECK:.func (.param .align 16 .b8 func_retval0[16])
; CHECK-LABEL: test_v5f16(
; CHECK: .param .align 16 .b8 test_v5f16_param_0[16]
-; CHECK-DAG: ld.param.v4.b16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5f16_param_0];
+; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v5f16_param_0];
; CHECK-DAG: ld.param.b16 [[E4:%rs[0-9]+]], [test_v5f16_param_0+8];
; CHECK: .param .align 16 .b8 param0[16];
-; CHECK-DAG: st.param.v4.b16 [param0],
-; CHECK-DAG: st.param.b16 [param0+8], [[E4]];
; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK-DAG: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b16 [param0+8], [[E4]];
; CHECK: call.uni (retval0), test_v5f16,
-; CHECK-DAG: ld.param.v4.b16 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]], [[R2:%rs[0-9]+]], [[R3:%rs[0-9]+]]}, [retval0];
+; CHECK-DAG: ld.param.v2.b32 {[[R0:%r[0-9]+]], [[R1:%r[0-9]+]]}, [retval0];
; CHECK-DAG: ld.param.b16 [[R4:%rs[0-9]+]], [retval0+8];
-; CHECK-DAG: st.param.v4.b16 [func_retval0], {[[R0]], [[R1]], [[R2]], [[R3]]};
+; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[R0]], [[R1]]};
; CHECK-DAG: st.param.b16 [func_retval0+8], [[R4]];
; CHECK: ret;
define <5 x half> @test_v5f16(<5 x half> %a) {
@@ -490,8 +485,8 @@ define <5 x half> @test_v5f16(<5 x half> %a) {
; CHECK: .param .align 16 .b8 test_v8f16_param_0[16]
; CHECK: ld.param.v4.b32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]], [[R45:%r[0-9]+]], [[R67:%r[0-9]+]]}, [test_v8f16_param_0];
; CHECK: .param .align 16 .b8 param0[16];
-; CHECK: st.param.v4.b32 [param0], {[[R01]], [[R23]], [[R45]], [[R67]]};
; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: st.param.v4.b32 [param0], {[[R01]], [[R23]], [[R45]], [[R67]]};
; CHECK: call.uni (retval0), test_v8f16,
; CHECK: ld.param.v4.b32 {[[RH01:%r[0-9]+]], [[RH23:%r[0-9]+]], [[RH45:%r[0-9]+]], [[RH67:%r[0-9]+]]}, [retval0];
; CHECK: st.param.v4.b32 [func_retval0], {[[RH01]], [[RH23]], [[RH45]], [[RH67]]};
@@ -504,20 +499,20 @@ define <8 x half> @test_v8f16(<8 x half> %a) {
; CHECK:.func (.param .align 32 .b8 func_retval0[32])
; CHECK-LABEL: test_v9f16(
; CHECK: .param .align 32 .b8 test_v9f16_param_0[32]
-; CHECK-DAG: ld.param.v4.b16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v9f16_param_0];
-; CHECK-DAG: ld.param.v4.b16 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [test_v9f16_param_0+8];
+; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v9f16_param_0];
+; CHECK-DAG: ld.param.v2.b32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v9f16_param_0+8];
; CHECK-DAG: ld.param.b16 [[E8:%rs[0-9]+]], [test_v9f16_param_0+16];
; CHECK: .param .align 32 .b8 param0[32];
-; CHECK-DAG: st.param.v4.b16 [param0],
-; CHECK-DAG: st.param.v4.b16 [param0+8],
-; CHECK-DAG: st.param.b16 [param0+16], [[E8]];
; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK-DAG: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]};
+; CHECK-DAG: st.param.b16 [param0+16], [[E8]];
; CHECK: call.uni (retval0), test_v9f16,
-; CHECK-DAG: ld.param.v4.b16 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]], [[R2:%rs[0-9]+]], [[R3:%rs[0-9]+]]}, [retval0];
-; CHECK-DAG: ld.param.v4.b16 {[[R4:%rs[0-9]+]], [[R5:%rs[0-9]+]], [[R6:%rs[0-9]+]], [[R7:%rs[0-9]+]]}, [retval0+8];
+; CHECK-DAG: ld.param.v2.b32 {[[R0:%r[0-9]+]], [[R1:%r[0-9]+]]}, [retval0];
+; CHECK-DAG: ld.param.v2.b32 {[[R2:%r[0-9]+]], [[R3:%r[0-9]+]]}, [retval0+8];
; CHECK-DAG: ld.param.b16 [[R8:%rs[0-9]+]], [retval0+16];
-; CHECK-DAG: st.param.v4.b16 [func_retval0], {[[R0]], [[R1]], [[R2]], [[R3]]};
-; CHECK-DAG: st.param.v4.b16 [func_retval0+8], {[[R4]], [[R5]], [[R6]], [[R7]]};
+; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[R0]], [[R1]]};
+; CHECK-DAG: st.param.v2.b32 [func_retval0+8], {[[R2]], [[R3]]};
; CHECK-DAG: st.param.b16 [func_retval0+16], [[R8]];
; CHECK: ret;
define <9 x half> @test_v9f16(<9 x half> %a) {
@@ -531,8 +526,8 @@ define <9 x half> @test_v9f16(<9 x half> %a) {
; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i19_param_0];
; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i19_param_0+2];
; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: .param .b32 retval0;
+; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: call.uni (retval0), test_i19,
; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0];
; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}};
@@ -548,8 +543,8 @@ define i19 @test_i19(i19 %a) {
; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i23_param_0];
; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i23_param_0+2];
; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: .param .b32 retval0;
+; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: call.uni (retval0), test_i23,
; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0];
; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}};
@@ -565,8 +560,8 @@ define i23 @test_i23(i23 %a) {
; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i24_param_0+2];
; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i24_param_0];
; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: .param .b32 retval0;
+; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: call.uni (retval0), test_i24,
; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0];
; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}};
@@ -581,8 +576,8 @@ define i24 @test_i24(i24 %a) {
; CHECK-NEXT: .param .b32 test_i29_param_0
; CHECK: ld.param.b32 {{%r[0-9]+}}, [test_i29_param_0];
; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: .param .b32 retval0;
+; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: call.uni (retval0), test_i29,
; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0];
; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}};
@@ -597,8 +592,8 @@ define i29 @test_i29(i29 %a) {
; CHECK-NEXT: .param .b32 test_i32_param_0
; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_i32_param_0];
; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], [[E]];
; CHECK: .param .b32 retval0;
+; CHECK: st.param.b32 [param0], [[E]];
; CHECK: call.uni (retval0), test_i32,
; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0];
; CHECK: st.param.b32 [func_retval0], [[R]];
@@ -613,10 +608,10 @@ define i32 @test_i32(i32 %a) {
; CHECK-NEXT: .param .align 16 .b8 test_v3i32_param_0[16]
; CHECK-DAG: ld.param.b32 [[E2:%r[0-9]+]], [test_v3i32_param_0+8];
; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v3i32_param_0];
-; CHECK: .param .align 16 .b8 param0[16];
-; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
-; CHECK: st.param.b32 [param0+8], [[E2]];
-; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK-DAG: .param .align 16 .b8 param0[16];
+; CHECK-DAG: .param .align 16 .b8 retval0[16];
+; CHECK-DAG: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b32 [param0+8], [[E2]];
; CHECK: call.uni (retval0), test_v3i32,
; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0];
; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
@@ -632,9 +627,9 @@ define <3 x i32> @test_v3i32(<3 x i32> %a) {
; CHECK-LABEL: test_v4i32(
; CHECK-NEXT: .param .align 16 .b8 test_v4i32_param_0[16]
; CHECK: ld.param.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v4i32_param_0]
-; CHECK: .param .align 16 .b8 param0[16];
-; CHECK: st.param.v4.b32 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]};
-; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK-DAG: .param .align 16 .b8 param0[16];
+; CHECK-DAG: .param .align 16 .b8 retval0[16];
+; CHECK-DAG: st.param.v4.b32 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]};
; CHECK: call.uni (retval0), test_v4i32,
; CHECK: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0];
; CHECK: st.param.v4.b32 [func_retval0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
@@ -650,9 +645,9 @@ define <4 x i32> @test_v4i32(<4 x i32> %a) {
; CHECK-DAG: ld.param.b32 [[E4:%r[0-9]+]], [test_v5i32_param_0+16];
; CHECK-DAG: ld.param.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v5i32_param_0]
; CHECK: .param .align 32 .b8 param0[32];
+; CHECK: .param .align 32 .b8 retval0[32];
; CHECK-DAG: st.param.v4.b32 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]};
; CHECK-DAG: st.param.b32 [param0+16], [[E4]];
-; CHECK: .param .align 32 .b8 retval0[32];
; CHECK: call.uni (retval0), test_v5i32,
; CHECK-DAG: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0];
; CHECK-DAG: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16];
@@ -669,8 +664,8 @@ define <5 x i32> @test_v5i32(<5 x i32> %a) {
; CHECK-NEXT: .param .b32 test_f32_param_0
; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_f32_param_0];
; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0], [[E]];
; CHECK: .param .b32 retval0;
+; CHECK: st.param.b32 [param0], [[E]];
; CHECK: call.uni (retval0), test_f32,
; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0];
; CHECK: st.param.b32 [func_retval0], [[R]];
@@ -686,8 +681,8 @@ define float @test_f32(float %a) {
; CHECK-DAG: ld.param.b8 {{%rd[0-9]+}}, [test_i40_param_0+4];
; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i40_param_0];
; CHECK: .param .b64 param0;
-; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: .param .b64 retval0;
+; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: call.uni (retval0), test_i40,
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0];
; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}};
@@ -703,8 +698,8 @@ define i40 @test_i40(i40 %a) {
; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i47_param_0+4];
; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i47_param_0];
; CHECK: .param .b64 param0;
-; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: .param .b64 retval0;
+; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: call.uni (retval0), test_i47,
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0];
; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}};
@@ -720,8 +715,8 @@ define i47 @test_i47(i47 %a) {
; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i48_param_0+4];
; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i48_param_0];
; CHECK: .param .b64 param0;
-; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: .param .b64 retval0;
+; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: call.uni (retval0), test_i48,
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0];
; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}};
@@ -738,8 +733,8 @@ define i48 @test_i48(i48 %a) {
; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i51_param_0+4];
; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i51_param_0];
; CHECK: .param .b64 param0;
-; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: .param .b64 retval0;
+; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: call.uni (retval0), test_i51,
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0];
; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}};
@@ -756,8 +751,8 @@ define i51 @test_i51(i51 %a) {
; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i56_param_0+4];
; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i56_param_0];
; CHECK: .param .b64 param0;
-; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: .param .b64 retval0;
+; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: call.uni (retval0), test_i56,
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0];
; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}};
@@ -772,8 +767,8 @@ define i56 @test_i56(i56 %a) {
; CHECK-NEXT: .param .b64 test_i57_param_0
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i57_param_0];
; CHECK: .param .b64 param0;
-; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: .param .b64 retval0;
+; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: call.uni (retval0), test_i57,
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0];
; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}};
@@ -788,8 +783,8 @@ define i57 @test_i57(i57 %a) {
; CHECK-NEXT: .param .b64 test_i64_param_0
; CHECK: ld.param.b64 [[E:%rd[0-9]+]], [test_i64_param_0];
; CHECK: .param .b64 param0;
-; CHECK: st.param.b64 [param0], [[E]];
; CHECK: .param .b64 retval0;
+; CHECK: st.param.b64 [param0], [[E]];
; CHECK: call.uni (retval0), test_i64,
; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0];
; CHECK: st.param.b64 [func_retval0], [[R]];
@@ -805,9 +800,9 @@ define i64 @test_i64(i64 %a) {
; CHECK-DAG: ld.param.b64 [[E2:%rd[0-9]+]], [test_v3i64_param_0+16];
; CHECK-DAG: ld.param.v2.b64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v3i64_param_0];
; CHECK: .param .align 32 .b8 param0[32];
-; CHECK: st.param.v2.b64 [param0], {[[E0]], [[E1]]};
-; CHECK: st.param.b64 [param0+16], [[E2]];
; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK-DAG: st.param.v2.b64 [param0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b64 [param0+16], [[E2]];
; CHECK: call.uni (retval0), test_v3i64,
; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0];
; CHECK: ld.param.b64 [[RE2:%rd[0-9]+]], [retval0+16];
@@ -828,9 +823,9 @@ define <3 x i64> @test_v3i64(<3 x i64> %a) {
; CHECK-DAG: ld.param.v2.b64 {[[E2:%rd[0-9]+]], [[E3:%rd[0-9]+]]}, [test_v4i64_param_0+16];
; CHECK-DAG: ld.param.v2.b64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v4i64_param_0];
; CHECK: .param .align 32 .b8 param0[32];
-; CHECK: st.param.v2.b64 [param0], {[[E0]], [[E1]]};
-; CHECK: st.param.v2.b64 [param0+16], {[[E2]], [[E3]]};
; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK-DAG: st.param.v2.b64 [param0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.v2.b64 [param0+16], {[[E2]], [[E3]]};
; CHECK: call.uni (retval0), test_v4i64,
; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0];
; CHECK: ld.param.v2.b64 {[[RE2:%rd[0-9]+]], [[RE3:%rd[0-9]+]]}, [retval0+16];
@@ -849,8 +844,8 @@ define <4 x i64> @test_v4i64(<4 x i64> %a) {
; CHECK-NEXT: .align 1 .b8 test_s_i1_param_0[1]
; CHECK: ld.param.b8 [[A:%rs[0-9]+]], [test_s_i1_param_0];
; CHECK: .param .align 1 .b8 param0[1];
-; CHECK: st.param.b8 [param0], [[A]]
; CHECK: .param .align 1 .b8 retval0[1];
+; CHECK: st.param.b8 [param0], [[A]]
; CHECK: call.uni (retval0), test_s_i1,
; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0];
; CHECK: st.param.b8 [func_retval0], [[R]];
@@ -865,8 +860,8 @@ define %s_i1 @test_s_i1(%s_i1 %a) {
; CHECK-NEXT: .param .align 1 .b8 test_s_i8_param_0[1]
; CHECK: ld.param.b8 [[A:%rs[0-9]+]], [test_s_i8_param_0];
; CHECK: .param .align 1 .b8 param0[1];
-; CHECK: st.param.b8 [param0], [[A]]
; CHECK: .param .align 1 .b8 retval0[1];
+; CHECK: st.param.b8 [param0], [[A]]
; CHECK: call.uni (retval0), test_s_i8,
; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0];
; CHECK: st.param.b8 [func_retval0], [[R]];
@@ -881,8 +876,8 @@ define %s_i8 @test_s_i8(%s_i8 %a) {
; CHECK-NEXT: .param .align 2 .b8 test_s_i16_param_0[2]
; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_s_i16_param_0];
; CHECK: .param .align 2 .b8 param0[2];
-; CHECK: st.param.b16 [param0], [[A]]
; CHECK: .param .align 2 .b8 retval0[2];
+; CHECK: st.param.b16 [param0], [[A]]
; CHECK: call.uni (retval0), test_s_i16,
; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0];
; CHECK: st.param.b16 [func_retval0], [[R]];
@@ -897,8 +892,8 @@ define %s_i16 @test_s_i16(%s_i16 %a) {
; CHECK-NEXT: .param .align 2 .b8 test_s_f16_param_0[2]
; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_s_f16_param_0];
; CHECK: .param .align 2 .b8 param0[2];
-; CHECK: st.param.b16 [param0], [[A]]
; CHECK: .param .align 2 .b8 retval0[2];
+; CHECK: st.param.b16 [param0], [[A]]
; CHECK: call.uni (retval0), test_s_f16,
; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0];
; CHECK: st.param.b16 [func_retval0], [[R]];
@@ -913,8 +908,8 @@ define %s_f16 @test_s_f16(%s_f16 %a) {
; CHECK-NEXT: .param .align 4 .b8 test_s_i32_param_0[4]
; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_s_i32_param_0];
; CHECK: .param .align 4 .b8 param0[4]
-; CHECK: st.param.b32 [param0], [[E]];
; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: st.param.b32 [param0], [[E]];
; CHECK: call.uni (retval0), test_s_i32,
; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0];
; CHECK: st.param.b32 [func_retval0], [[R]];
@@ -929,8 +924,8 @@ define %s_i32 @test_s_i32(%s_i32 %a) {
; CHECK-NEXT: .param .align 4 .b8 test_s_f32_param_0[4]
; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_s_f32_param_0];
; CHECK: .param .align 4 .b8 param0[4]
-; CHECK: st.param.b32 [param0], [[E]];
; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: st.param.b32 [param0], [[E]];
; CHECK: call.uni (retval0), test_s_f32,
; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0];
; CHECK: st.param.b32 [func_retval0], [[R]];
@@ -945,8 +940,8 @@ define %s_f32 @test_s_f32(%s_f32 %a) {
; CHECK-NEXT: .param .align 8 .b8 test_s_i64_param_0[8]
; CHECK: ld.param.b64 [[E:%rd[0-9]+]], [test_s_i64_param_0];
; CHECK: .param .align 8 .b8 param0[8];
-; CHECK: st.param.b64 [param0], [[E]];
; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: st.param.b64 [param0], [[E]];
; CHECK: call.uni (retval0), test_s_i64,
; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0];
; CHECK: st.param.b64 [func_retval0], [[R]];
@@ -966,12 +961,12 @@ define %s_i64 @test_s_i64(%s_i64 %a) {
; CHECK-DAG: ld.param.b32 [[E1:%r[0-9]+]], [test_s_i32f32_param_0+4];
; CHECK-DAG: ld.param.b32 [[E0:%r[0-9]+]], [test_s_i32f32_param_0];
; CHECK: .param .align 8 .b8 param0[24];
+; CHECK: .param .align 8 .b8 retval0[24];
; CHECK-DAG: st.param.b32 [param0], [[E0]];
; CHECK-DAG: st.param.b32 [param0+4], [[E1]];
; CHECK-DAG: st.param.b32 [param0+8], [[E2]];
; CHECK-DAG: st.param.b32 [param0+12], [[E3]];
; CHECK-DAG: st.param.b64 [param0+16], [[E4]];
-; CHECK: .param .align 8 .b8 retval0[24];
; CHECK: call.uni (retval0), test_s_i32f32,
; CHECK-DAG: ld.param.b32 [[RE0:%r[0-9]+]], [retval0];
; CHECK-DAG: ld.param.b32 [[RE1:%r[0-9]+]], [retval0+4];
@@ -997,10 +992,10 @@ define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) {
; CHECK-DAG: ld.param.v2.b32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_s_i32x4_param_0+8];
; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i32x4_param_0];
; CHECK: .param .align 8 .b8 param0[24];
-; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
-; CHECK: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]};
-; CHECK: st.param.b64 [param0+16], [[E4]];
; CHECK: .param .align 8 .b8 retval0[24];
+; CHECK-DAG: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]};
+; CHECK-DAG: st.param.b64 [param0+16], [[E4]];
; CHECK: call.uni (retval0), test_s_i32x4,
; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0];
; CHECK: ld.param.v2.b32 {[[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+8];
@@ -1024,16 +1019,13 @@ define %s_i32x4 @test_s_i32x4(%s_i32x4 %a) {
; CHECK: ld.param.b8 [[E2:%rs[0-9]+]], [test_s_i1i32x4_param_0+8];
; CHECK: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i1i32x4_param_0];
; CHECK: .param .align 8 .b8 param0[32];
-; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
-; CHECK: st.param.b8 [param0+8], [[E2]];
-; CHECK: st.param.b32 [param0+12], [[E3]];
-; CHECK: st.param.b32 [param0+16], [[E4]];
-; CHECK: st.param.b64 [param0+24], [[E5]];
; CHECK: .param .align 8 .b8 retval0[32];
-; CHECK: call.uni (retval0), test_s_i1i32x4,
-; CHECK: (
-; CHECK: param0
-; CHECK: );
+; CHECK-DAG: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b8 [param0+8], [[E2]];
+; CHECK-DAG: st.param.b32 [param0+12], [[E3]];
+; CHECK-DAG: st.param.b32 [param0+16], [[E4]];
+; CHECK-DAG: st.param.b64 [param0+24], [[E5]];
+; CHECK: call.uni (retval0), test_s_i1i32x4, (param0);
; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0];
; CHECK: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+8];
; CHECK: ld.param.b32 [[RE3:%r[0-9]+]], [retval0+12];
@@ -1082,6 +1074,7 @@ define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) {
; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+1];
; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0];
; CHECK: .param .align 1 .b8 param0[25];
+; CHECK: .param .align 1 .b8 retval0[25];
; CHECK-DAG: st.param.b8 [param0],
; CHECK-DAG: st.param.b8 [param0+1],
; CHECK-DAG: st.param.b8 [param0+2],
@@ -1107,33 +1100,32 @@ define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) {
; CHECK-DAG: st.param.b8 [param0+22],
; CHECK-DAG: st.param.b8 [param0+23],
; CHECK-DAG: st.param.b8 [param0+24],
-; CHECK: .param .align 1 .b8 retval0[25];
-; CHECK: call.uni (retval0), test_s_i1i32x4p,
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+1];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+2];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+3];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+4];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+5];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+6];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+7];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+8];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+9];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+10];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+11];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+12];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+13];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+14];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+15];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+16];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+17];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+18];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+19];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+20];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+21];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+22];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+23];
-; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+24];
+; CHECK: call.uni (retval0), test_s_i1i32x4p, (param0);
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+8];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+3];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+2];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+1];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+7];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+6];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+5];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+4];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+12];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+11];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+10];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+9];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+16];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+15];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+14];
+; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+13];
+; CHECK-DAG: ld.param.b8 %rd{{[0-9]+}}, [retval0+24];
+; CHECK-DAG: ld.param.b8 %rd{{[0-9]+}}, [retval0+23];
+; CHECK-DAG: ld.param.b8 %rd{{[0-9]+}}, [retval0+22];
+; CHECK-DAG: ld.param.b8 %rd{{[0-9]+}}, [retval0+21];
+; CHECK-DAG: ld.param.b8 %rd{{[0-9]+}}, [retval0+20];
+; CHECK-DAG: ld.param.b8 %rd{{[0-9]+}}, [retval0+19];
+; CHECK-DAG: ld.param.b8 %rd{{[0-9]+}}, [retval0+18];
+; CHECK-DAG: ld.param.b8 %rd{{[0-9]+}}, [retval0+17];
; CHECK: } // callseq
; CHECK-DAG: st.param.b8 [func_retval0],
; CHECK-DAG: st.param.b8 [func_retval0+1],
@@ -1177,13 +1169,13 @@ define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) {
; CHECK: ld.param.b32 [[E2:%r[0-9]+]], [test_s_crossfield_param_0+8];
; CHECK: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_crossfield_param_0];
; CHECK: .param .align 16 .b8 param0[80];
-; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
-; CHECK: st.param.b32 [param0+8], [[E2]];
-; CHECK: st.param.v4.b32 [param0+16], {[[E3]], [[E4]], [[E5]], [[E6]]};
-; CHECK: st.param.v4.b32 [param0+32], {[[E7]], [[E8]], [[E9]], [[E10]]};
-; CHECK: st.param.v4.b32 [param0+48], {[[E11]], [[E12]], [[E13]], [[E14]]};
-; CHECK: st.param.b32 [param0+64], [[E15]];
; CHECK: .param .align 16 .b8 retval0[80];
+; CHECK-DAG: st.param.v2.b32 [param0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b32 [param0+8], [[E2]];
+; CHECK-DAG: st.param.v4.b32 [param0+16], {[[E3]], [[E4]], [[E5]], [[E6]]};
+; CHECK-DAG: st.param.v4.b32 [param0+32], {[[E7]], [[E8]], [[E9]], [[E10]]};
+; CHECK-DAG: st.param.v4.b32 [param0+48], {[[E11]], [[E12]], [[E13]], [[E14]]};
+; CHECK-DAG: st.param.b32 [param0+64], [[E15]];
; CHECK: call.uni (retval0), test_s_crossfield,
; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0];
; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
diff --git a/llvm/test/CodeGen/NVPTX/param-overalign.ll b/llvm/test/CodeGen/NVPTX/param-overalign.ll
index 88ad0b0..2155fb4 100644
--- a/llvm/test/CodeGen/NVPTX/param-overalign.ll
+++ b/llvm/test/CodeGen/NVPTX/param-overalign.ll
@@ -28,8 +28,8 @@ define float @caller_md(float %a, float %b) {
; CHECK-NEXT: ld.param.b32 %r2, [caller_md_param_1];
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2};
; CHECK-NEXT: .param .b32 retval0;
+; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2};
; CHECK-NEXT: call.uni (retval0), callee_md, (param0);
; CHECK-NEXT: ld.param.b32 %r3, [retval0];
; CHECK-NEXT: } // callseq 0
@@ -69,8 +69,8 @@ define float @caller(float %a, float %b) {
; CHECK-NEXT: ld.param.b32 %r2, [caller_param_1];
; CHECK-NEXT: { // callseq 1, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2};
; CHECK-NEXT: .param .b32 retval0;
+; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2};
; CHECK-NEXT: call.uni (retval0), callee, (param0);
; CHECK-NEXT: ld.param.b32 %r3, [retval0];
; CHECK-NEXT: } // callseq 1
diff --git a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
index a480984a..a592b82 100644
--- a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
+++ b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
@@ -84,8 +84,8 @@ define dso_local void @caller_St4x1(ptr nocapture noundef readonly byval(%struct
; CHECK: .param .b64 caller_St4x1_param_1
; CHECK: )
; CHECK: .param .b32 param0;
- ; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: .param .align 16 .b8 retval0[4];
+ ; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: call.uni (retval0), callee_St4x1, (param0);
; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0];
%1 = load i32, ptr %in, align 4
@@ -112,8 +112,8 @@ define dso_local void @caller_St4x2(ptr nocapture noundef readonly byval(%struct
; CHECK: .param .b64 caller_St4x2_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[8];
- ; CHECK: st.param.v2.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: .param .align 16 .b8 retval0[8];
+ ; CHECK: st.param.v2.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: call.uni (retval0), callee_St4x2, (param0);
; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
%agg.tmp = alloca %struct.St4x2, align 8
@@ -149,9 +149,9 @@ define dso_local void @caller_St4x3(ptr nocapture noundef readonly byval(%struct
; CHECK: .param .b64 caller_St4x3_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[12];
+ ; CHECK: .param .align 16 .b8 retval0[12];
; CHECK: st.param.v2.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: st.param.b32 [param0+8], {{%r[0-9]+}};
- ; CHECK: .param .align 16 .b8 retval0[12];
; CHECK: call.uni (retval0), callee_St4x3, (param0);
; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+8];
@@ -193,8 +193,8 @@ define dso_local void @caller_St4x4(ptr nocapture noundef readonly byval(%struct
; CHECK: .param .b64 caller_St4x4_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[16];
- ; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: .param .align 16 .b8 retval0[16];
+ ; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: call.uni (retval0), callee_St4x4, (param0);
; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
%call = tail call fastcc [4 x i32] @callee_St4x4(ptr noundef nonnull byval(%struct.St4x4) align 4 %in) #2
@@ -239,9 +239,9 @@ define dso_local void @caller_St4x5(ptr nocapture noundef readonly byval(%struct
; CHECK: .param .b64 caller_St4x5_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[20];
+ ; CHECK: .param .align 16 .b8 retval0[20];
; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: st.param.b32 [param0+16], {{%r[0-9]+}};
- ; CHECK: .param .align 16 .b8 retval0[20];
; CHECK: call.uni (retval0), callee_St4x5, (param0);
; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+16];
@@ -295,9 +295,9 @@ define dso_local void @caller_St4x6(ptr nocapture noundef readonly byval(%struct
; CHECK: .param .b64 caller_St4x6_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[24];
+ ; CHECK: .param .align 16 .b8 retval0[24];
; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: st.param.v2.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}};
- ; CHECK: .param .align 16 .b8 retval0[24];
; CHECK: call.uni (retval0), callee_St4x6, (param0);
; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16];
@@ -357,10 +357,10 @@ define dso_local void @caller_St4x7(ptr nocapture noundef readonly byval(%struct
; CHECK: .param .b64 caller_St4x7_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[28];
+ ; CHECK: .param .align 16 .b8 retval0[28];
; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: st.param.v2.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: st.param.b32 [param0+24], {{%r[0-9]+}};
- ; CHECK: .param .align 16 .b8 retval0[28];
; CHECK: call.uni (retval0), callee_St4x7, (param0);
; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16];
@@ -429,9 +429,9 @@ define dso_local void @caller_St4x8(ptr nocapture noundef readonly byval(%struct
; CHECK: .param .b64 caller_St4x8_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[32];
- ; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
- ; CHECK: st.param.v4.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: .param .align 16 .b8 retval0[32];
+ ; CHECK-DAG: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
+ ; CHECK-DAG: st.param.v4.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: call.uni (retval0), callee_St4x8, (param0);
; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16];
@@ -503,8 +503,8 @@ define dso_local void @caller_St8x1(ptr nocapture noundef readonly byval(%struct
; CHECK: .param .b64 caller_St8x1_param_1
; CHECK: )
; CHECK: .param .b64 param0;
- ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: .param .align 16 .b8 retval0[8];
+ ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: call.uni (retval0), callee_St8x1, (param0);
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0];
%1 = load i64, ptr %in, align 8
@@ -531,8 +531,8 @@ define dso_local void @caller_St8x2(ptr nocapture noundef readonly byval(%struct
; CHECK: .param .b64 caller_St8x2_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[16];
- ; CHECK: st.param.v2.b64 [param0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
; CHECK: .param .align 16 .b8 retval0[16];
+ ; CHECK: st.param.v2.b64 [param0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
; CHECK: call.uni (retval0), callee_St8x2, (param0);
; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0];
%call = tail call fastcc [2 x i64] @callee_St8x2(ptr noundef nonnull byval(%struct.St8x2) align 8 %in) #2
@@ -565,9 +565,9 @@ define dso_local void @caller_St8x3(ptr nocapture noundef readonly byval(%struct
; CHECK: .param .b64 caller_St8x3_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[24];
+ ; CHECK: .param .align 16 .b8 retval0[24];
; CHECK: st.param.v2.b64 [param0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
; CHECK: st.param.b64 [param0+16], {{%rd[0-9]+}};
- ; CHECK: .param .align 16 .b8 retval0[24];
; CHECK: call.uni (retval0), callee_St8x3, (param0);
; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0];
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0+16];
@@ -609,9 +609,9 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct
; CHECK: .param .b64 caller_St8x4_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[32];
- ; CHECK: st.param.v2.b64 [param0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
- ; CHECK: st.param.v2.b64 [param0+16], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
; CHECK: .param .align 16 .b8 retval0[32];
+ ; CHECK-DAG: st.param.v2.b64 [param0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
+ ; CHECK-DAG: st.param.v2.b64 [param0+16], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
; CHECK: call.uni (retval0), callee_St8x4, (param0);
; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0];
; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+16];
diff --git a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir
index 5d0d6f6..4a53152 100644
--- a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir
+++ b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir
@@ -77,7 +77,7 @@ constants: []
machineFunctionInfo: {}
body: |
bb.0:
- %0:b32, %1:b32, %2:b32, %3:b32 = LoadParamMemV4I32 0
+ %0:b32, %1:b32, %2:b32, %3:b32 = LDV_i32_v4 0, 0, 101, 3, 32, &retval0, 0 :: (load (s128), addrspace 101)
; CHECK-NOT: ProxyReg
%4:b32 = ProxyRegB32 killed %0
%5:b32 = ProxyRegB32 killed %1
@@ -86,7 +86,7 @@ body: |
; CHECK: STV_i32_v4 killed %0, killed %1, killed %2, killed %3
STV_i32_v4 killed %4, killed %5, killed %6, killed %7, 0, 0, 101, 32, &func_retval0, 0 :: (store (s128), addrspace 101)
- %8:b32 = LoadParamMemI32 0
+ %8:b32 = LD_i32 0, 0, 101, 3, 32, &retval0, 0 :: (load (s32), addrspace 101)
; CHECK-NOT: ProxyReg
%9:b32 = ProxyRegB32 killed %8
%10:b32 = ProxyRegB32 killed %9
diff --git a/llvm/test/CodeGen/NVPTX/st-param-imm.ll b/llvm/test/CodeGen/NVPTX/st-param-imm.ll
index 6aa1119..f90435a 100644
--- a/llvm/test/CodeGen/NVPTX/st-param-imm.ll
+++ b/llvm/test/CodeGen/NVPTX/st-param-imm.ll
@@ -26,8 +26,8 @@ define void @st_param_i8_i16() {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 2 .b8 param0[4];
-; CHECK-NEXT: st.param.b8 [param0], 1;
; CHECK-NEXT: st.param.b16 [param0+2], 2;
+; CHECK-NEXT: st.param.b8 [param0], 1;
; CHECK-NEXT: call.uni call_i8_i16, (param0);
; CHECK-NEXT: } // callseq 0
; CHECK-NEXT: ret;
@@ -75,7 +75,7 @@ define void @st_param_f32() {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: { // callseq 3, 0
; CHECK-NEXT: .param .b32 param0;
-; CHECK-NEXT: st.param.b32 [param0], 0f40A00000;
+; CHECK-NEXT: st.param.b32 [param0], 1084227584;
; CHECK-NEXT: call.uni call_f32, (param0);
; CHECK-NEXT: } // callseq 3
; CHECK-NEXT: ret;
@@ -91,7 +91,7 @@ define void @st_param_f64() {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: { // callseq 4, 0
; CHECK-NEXT: .param .b64 param0;
-; CHECK-NEXT: st.param.b64 [param0], 0d4018000000000000;
+; CHECK-NEXT: st.param.b64 [param0], 4618441417868443648;
; CHECK-NEXT: call.uni call_f64, (param0);
; CHECK-NEXT: } // callseq 4
; CHECK-NEXT: ret;
@@ -165,7 +165,7 @@ define void @st_param_v2_i16_ii() {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: { // callseq 8, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v2.b16 [param0], {1, 2};
+; CHECK-NEXT: st.param.b32 [param0], 131073;
; CHECK-NEXT: call.uni call_v2_i16, (param0);
; CHECK-NEXT: } // callseq 8
; CHECK-NEXT: ret;
@@ -432,7 +432,7 @@ define void @st_param_v4_i8_iiii() {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: { // callseq 23, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, 3, 4};
+; CHECK-NEXT: st.param.b32 [param0], 67305985;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 23
; CHECK-NEXT: ret;
@@ -442,15 +442,18 @@ define void @st_param_v4_i8_iiii() {
define void @st_param_v4_i8_irrr(i8 %b, i8 %c, i8 %d) {
; CHECK-LABEL: st_param_v4_i8_irrr(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irrr_param_2];
-; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_irrr_param_1];
-; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_irrr_param_0];
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_irrr_param_2];
+; CHECK-NEXT: ld.param.b8 %r2, [st_param_v4_i8_irrr_param_1];
+; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U;
+; CHECK-NEXT: ld.param.b8 %r4, [st_param_v4_i8_irrr_param_0];
+; CHECK-NEXT: prmt.b32 %r5, 1, %r4, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x5410U;
; CHECK-NEXT: { // callseq 24, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs3, %rs2, %rs1};
+; CHECK-NEXT: st.param.b32 [param0], %r6;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 24
; CHECK-NEXT: ret;
@@ -464,15 +467,18 @@ define void @st_param_v4_i8_irrr(i8 %b, i8 %c, i8 %d) {
define void @st_param_v4_i8_rirr(i8 %a, i8 %c, i8 %d) {
; CHECK-LABEL: st_param_v4_i8_rirr(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rirr_param_2];
-; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rirr_param_1];
-; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_rirr_param_0];
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_rirr_param_2];
+; CHECK-NEXT: ld.param.b8 %r2, [st_param_v4_i8_rirr_param_1];
+; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U;
+; CHECK-NEXT: ld.param.b8 %r4, [st_param_v4_i8_rirr_param_0];
+; CHECK-NEXT: prmt.b32 %r5, %r4, 2, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x5410U;
; CHECK-NEXT: { // callseq 25, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {%rs3, 2, %rs2, %rs1};
+; CHECK-NEXT: st.param.b32 [param0], %r6;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 25
; CHECK-NEXT: ret;
@@ -486,15 +492,18 @@ define void @st_param_v4_i8_rirr(i8 %a, i8 %c, i8 %d) {
define void @st_param_v4_i8_rrir(i8 %a, i8 %b, i8 %d) {
; CHECK-LABEL: st_param_v4_i8_rrir(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rrir_param_2];
-; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rrir_param_1];
-; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_rrir_param_0];
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_rrir_param_1];
+; CHECK-NEXT: ld.param.b8 %r2, [st_param_v4_i8_rrir_param_0];
+; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U;
+; CHECK-NEXT: ld.param.b8 %r4, [st_param_v4_i8_rrir_param_2];
+; CHECK-NEXT: prmt.b32 %r5, 3, %r4, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r6, %r3, %r5, 0x5410U;
; CHECK-NEXT: { // callseq 26, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {%rs3, %rs2, 3, %rs1};
+; CHECK-NEXT: st.param.b32 [param0], %r6;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 26
; CHECK-NEXT: ret;
@@ -508,15 +517,18 @@ define void @st_param_v4_i8_rrir(i8 %a, i8 %b, i8 %d) {
define void @st_param_v4_i8_rrri(i8 %a, i8 %b, i8 %c) {
; CHECK-LABEL: st_param_v4_i8_rrri(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rrri_param_2];
-; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rrri_param_1];
-; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_rrri_param_0];
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_rrri_param_1];
+; CHECK-NEXT: ld.param.b8 %r2, [st_param_v4_i8_rrri_param_0];
+; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U;
+; CHECK-NEXT: ld.param.b8 %r4, [st_param_v4_i8_rrri_param_2];
+; CHECK-NEXT: prmt.b32 %r5, %r4, 4, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r6, %r3, %r5, 0x5410U;
; CHECK-NEXT: { // callseq 27, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {%rs3, %rs2, %rs1, 4};
+; CHECK-NEXT: st.param.b32 [param0], %r6;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 27
; CHECK-NEXT: ret;
@@ -530,14 +542,16 @@ define void @st_param_v4_i8_rrri(i8 %a, i8 %b, i8 %c) {
define void @st_param_v4_i8_iirr(i8 %c, i8 %d) {
; CHECK-LABEL: st_param_v4_i8_iirr(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_iirr_param_1];
-; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_iirr_param_0];
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_iirr_param_1];
+; CHECK-NEXT: ld.param.b8 %r2, [st_param_v4_i8_iirr_param_0];
+; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r4, 513, %r3, 0x5410U;
; CHECK-NEXT: { // callseq 28, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, %rs2, %rs1};
+; CHECK-NEXT: st.param.b32 [param0], %r4;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 28
; CHECK-NEXT: ret;
@@ -551,14 +565,17 @@ define void @st_param_v4_i8_iirr(i8 %c, i8 %d) {
define void @st_param_v4_i8_irir(i8 %b, i8 %d) {
; CHECK-LABEL: st_param_v4_i8_irir(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irir_param_1];
-; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_irir_param_0];
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_irir_param_1];
+; CHECK-NEXT: prmt.b32 %r2, 3, %r1, 0x3340U;
+; CHECK-NEXT: ld.param.b8 %r3, [st_param_v4_i8_irir_param_0];
+; CHECK-NEXT: prmt.b32 %r4, 1, %r3, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U;
; CHECK-NEXT: { // callseq 29, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs2, 3, %rs1};
+; CHECK-NEXT: st.param.b32 [param0], %r5;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 29
; CHECK-NEXT: ret;
@@ -572,14 +589,17 @@ define void @st_param_v4_i8_irir(i8 %b, i8 %d) {
define void @st_param_v4_i8_irri(i8 %b, i8 %c) {
; CHECK-LABEL: st_param_v4_i8_irri(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irri_param_1];
-; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_irri_param_0];
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_irri_param_1];
+; CHECK-NEXT: prmt.b32 %r2, %r1, 4, 0x3340U;
+; CHECK-NEXT: ld.param.b8 %r3, [st_param_v4_i8_irri_param_0];
+; CHECK-NEXT: prmt.b32 %r4, 1, %r3, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U;
; CHECK-NEXT: { // callseq 30, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs2, %rs1, 4};
+; CHECK-NEXT: st.param.b32 [param0], %r5;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 30
; CHECK-NEXT: ret;
@@ -593,14 +613,17 @@ define void @st_param_v4_i8_irri(i8 %b, i8 %c) {
define void @st_param_v4_i8_riir(i8 %a, i8 %d) {
; CHECK-LABEL: st_param_v4_i8_riir(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_riir_param_1];
-; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_riir_param_0];
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_riir_param_1];
+; CHECK-NEXT: prmt.b32 %r2, 3, %r1, 0x3340U;
+; CHECK-NEXT: ld.param.b8 %r3, [st_param_v4_i8_riir_param_0];
+; CHECK-NEXT: prmt.b32 %r4, %r3, 2, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U;
; CHECK-NEXT: { // callseq 31, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {%rs2, 2, 3, %rs1};
+; CHECK-NEXT: st.param.b32 [param0], %r5;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 31
; CHECK-NEXT: ret;
@@ -614,14 +637,17 @@ define void @st_param_v4_i8_riir(i8 %a, i8 %d) {
define void @st_param_v4_i8_riri(i8 %a, i8 %c) {
; CHECK-LABEL: st_param_v4_i8_riri(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_riri_param_1];
-; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_riri_param_0];
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_riri_param_1];
+; CHECK-NEXT: prmt.b32 %r2, %r1, 4, 0x3340U;
+; CHECK-NEXT: ld.param.b8 %r3, [st_param_v4_i8_riri_param_0];
+; CHECK-NEXT: prmt.b32 %r4, %r3, 2, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U;
; CHECK-NEXT: { // callseq 32, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {%rs2, 2, %rs1, 4};
+; CHECK-NEXT: st.param.b32 [param0], %r5;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 32
; CHECK-NEXT: ret;
@@ -635,14 +661,16 @@ define void @st_param_v4_i8_riri(i8 %a, i8 %c) {
define void @st_param_v4_i8_rrii(i8 %a, i8 %b) {
; CHECK-LABEL: st_param_v4_i8_rrii(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rrii_param_1];
-; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rrii_param_0];
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_rrii_param_1];
+; CHECK-NEXT: ld.param.b8 %r2, [st_param_v4_i8_rrii_param_0];
+; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r4, %r3, 1027, 0x5410U;
; CHECK-NEXT: { // callseq 33, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {%rs2, %rs1, 3, 4};
+; CHECK-NEXT: st.param.b32 [param0], %r4;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 33
; CHECK-NEXT: ret;
@@ -656,13 +684,15 @@ define void @st_param_v4_i8_rrii(i8 %a, i8 %b) {
define void @st_param_v4_i8_iiir(i8 %d) {
; CHECK-LABEL: st_param_v4_i8_iiir(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_iiir_param_0];
; CHECK-NEXT: { // callseq 34, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, 3, %rs1};
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_iiir_param_0];
+; CHECK-NEXT: prmt.b32 %r2, 3, %r1, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r3, 513, %r2, 0x5410U;
+; CHECK-NEXT: st.param.b32 [param0], %r3;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 34
; CHECK-NEXT: ret;
@@ -676,13 +706,15 @@ define void @st_param_v4_i8_iiir(i8 %d) {
define void @st_param_v4_i8_iiri(i8 %c) {
; CHECK-LABEL: st_param_v4_i8_iiri(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_iiri_param_0];
; CHECK-NEXT: { // callseq 35, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, %rs1, 4};
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_iiri_param_0];
+; CHECK-NEXT: prmt.b32 %r2, %r1, 4, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r3, 513, %r2, 0x5410U;
+; CHECK-NEXT: st.param.b32 [param0], %r3;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 35
; CHECK-NEXT: ret;
@@ -696,13 +728,15 @@ define void @st_param_v4_i8_iiri(i8 %c) {
define void @st_param_v4_i8_irii(i8 %b) {
; CHECK-LABEL: st_param_v4_i8_irii(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irii_param_0];
; CHECK-NEXT: { // callseq 36, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs1, 3, 4};
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_irii_param_0];
+; CHECK-NEXT: prmt.b32 %r2, 1, %r1, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r3, %r2, 1027, 0x5410U;
+; CHECK-NEXT: st.param.b32 [param0], %r3;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 36
; CHECK-NEXT: ret;
@@ -716,13 +750,15 @@ define void @st_param_v4_i8_irii(i8 %b) {
define void @st_param_v4_i8_riii(i8 %a) {
; CHECK-LABEL: st_param_v4_i8_riii(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_riii_param_0];
; CHECK-NEXT: { // callseq 37, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
-; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, 2, 3, 4};
+; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_riii_param_0];
+; CHECK-NEXT: prmt.b32 %r2, %r1, 2, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r3, %r2, 1027, 0x5410U;
+; CHECK-NEXT: st.param.b32 [param0], %r3;
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 37
; CHECK-NEXT: ret;
@@ -742,7 +778,7 @@ define void @st_param_v4_i16_iiii() {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: { // callseq 38, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0], {1, 2, 3, 4};
+; CHECK-NEXT: st.param.v2.b32 [param0], {131073, 262147};
; CHECK-NEXT: call.uni call_v4_i16, (param0);
; CHECK-NEXT: } // callseq 38
; CHECK-NEXT: ret;
@@ -841,13 +877,15 @@ define void @st_param_v4_i16_iirr(i16 %c, i16 %d) {
; CHECK-LABEL: st_param_v4_i16_iirr(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_iirr_param_0];
; CHECK-NEXT: ld.param.b16 %rs2, [st_param_v4_i16_iirr_param_1];
+; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NEXT: { // callseq 43, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0], {1, 2, %rs1, %rs2};
+; CHECK-NEXT: st.param.v2.b32 [param0], {131073, %r1};
; CHECK-NEXT: call.uni call_v4_i16, (param0);
; CHECK-NEXT: } // callseq 43
; CHECK-NEXT: ret;
@@ -946,13 +984,15 @@ define void @st_param_v4_i16_rrii(i16 %a, i16 %b) {
; CHECK-LABEL: st_param_v4_i16_rrii(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_rrii_param_0];
; CHECK-NEXT: ld.param.b16 %rs2, [st_param_v4_i16_rrii_param_1];
+; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NEXT: { // callseq 48, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, %rs2, 3, 4};
+; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, 262147};
; CHECK-NEXT: call.uni call_v4_i16, (param0);
; CHECK-NEXT: } // callseq 48
; CHECK-NEXT: ret;
@@ -966,13 +1006,16 @@ define void @st_param_v4_i16_rrii(i16 %a, i16 %b) {
define void @st_param_v4_i16_iiir(i16 %d) {
; CHECK-LABEL: st_param_v4_i16_iiir(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_iiir_param_0];
+; CHECK-NEXT: mov.b16 %rs2, 3;
+; CHECK-NEXT: mov.b32 %r1, {%rs2, %rs1};
; CHECK-NEXT: { // callseq 49, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0], {1, 2, 3, %rs1};
+; CHECK-NEXT: st.param.v2.b32 [param0], {131073, %r1};
; CHECK-NEXT: call.uni call_v4_i16, (param0);
; CHECK-NEXT: } // callseq 49
; CHECK-NEXT: ret;
@@ -986,13 +1029,16 @@ define void @st_param_v4_i16_iiir(i16 %d) {
define void @st_param_v4_i16_iiri(i16 %c) {
; CHECK-LABEL: st_param_v4_i16_iiri(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_iiri_param_0];
+; CHECK-NEXT: mov.b16 %rs2, 4;
+; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NEXT: { // callseq 50, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0], {1, 2, %rs1, 4};
+; CHECK-NEXT: st.param.v2.b32 [param0], {131073, %r1};
; CHECK-NEXT: call.uni call_v4_i16, (param0);
; CHECK-NEXT: } // callseq 50
; CHECK-NEXT: ret;
@@ -1006,13 +1052,16 @@ define void @st_param_v4_i16_iiri(i16 %c) {
define void @st_param_v4_i16_irii(i16 %b) {
; CHECK-LABEL: st_param_v4_i16_irii(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_irii_param_0];
+; CHECK-NEXT: mov.b16 %rs2, 1;
+; CHECK-NEXT: mov.b32 %r1, {%rs2, %rs1};
; CHECK-NEXT: { // callseq 51, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0], {1, %rs1, 3, 4};
+; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, 262147};
; CHECK-NEXT: call.uni call_v4_i16, (param0);
; CHECK-NEXT: } // callseq 51
; CHECK-NEXT: ret;
@@ -1026,13 +1075,16 @@ define void @st_param_v4_i16_irii(i16 %b) {
define void @st_param_v4_i16_riii(i16 %a) {
; CHECK-LABEL: st_param_v4_i16_riii(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_riii_param_0];
+; CHECK-NEXT: mov.b16 %rs2, 2;
+; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NEXT: { // callseq 52, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, 2, 3, 4};
+; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, 262147};
; CHECK-NEXT: call.uni call_v4_i16, (param0);
; CHECK-NEXT: } // callseq 52
; CHECK-NEXT: ret;
@@ -1672,13 +1724,12 @@ declare void @call_v4_f32(%struct.float4 alignstack(16))
define void @st_param_bfloat() {
; CHECK-LABEL: st_param_bfloat(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-EMPTY:
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: mov.b16 %rs1, 0x4100;
; CHECK-NEXT: { // callseq 83, 0
; CHECK-NEXT: .param .align 2 .b8 param0[2];
-; CHECK-NEXT: st.param.b16 [param0], %rs1;
+; CHECK-NEXT: st.param.b16 [param0], 0x4100;
; CHECK-NEXT: call.uni call_bfloat, (param0);
; CHECK-NEXT: } // callseq 83
; CHECK-NEXT: ret;
diff --git a/llvm/test/CodeGen/NVPTX/store-undef.ll b/llvm/test/CodeGen/NVPTX/store-undef.ll
index 5b31b5e..c8ca6b6 100644
--- a/llvm/test/CodeGen/NVPTX/store-undef.ll
+++ b/llvm/test/CodeGen/NVPTX/store-undef.ll
@@ -34,9 +34,9 @@ define void @test_store_param_def(i64 %param0, i32 %param1) {
; CHECK-NEXT: ld.param.b32 %r1, [test_store_param_def_param_1];
; CHECK-NEXT: { // callseq 1, 0
; CHECK-NEXT: .param .align 16 .b8 param0[32];
+; CHECK-NEXT: st.param.v4.b32 [param0+16], {%r2, %r1, %r3, %r4};
+; CHECK-NEXT: st.param.v2.b32 [param0+8], {%r5, %r1};
; CHECK-NEXT: st.param.b64 [param0], %rd1;
-; CHECK-NEXT: st.param.v2.b32 [param0+8], {%r2, %r1};
-; CHECK-NEXT: st.param.v4.b32 [param0+16], {%r3, %r1, %r4, %r5};
; CHECK-NEXT: call.uni test_call, (param0);
; CHECK-NEXT: } // callseq 1
; CHECK-NEXT: ret;
diff --git a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
index d6961a9..3138d7c 100644
--- a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
@@ -69,8 +69,8 @@ define ptx_kernel void @baz(ptr %red, i32 %idx) {
; CHECK-NEXT: tex.1d.v4.f32.s32 {%r2, %r3, %r4, %r5}, [tex0, {%r1}];
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .b64 param0;
-; CHECK-NEXT: st.param.b64 [param0], %rd3;
; CHECK-NEXT: .param .b32 retval0;
+; CHECK-NEXT: st.param.b64 [param0], %rd3;
; CHECK-NEXT: call.uni (retval0), texfunc, (param0);
; CHECK-NEXT: ld.param.b32 %r6, [retval0];
; CHECK-NEXT: } // callseq 0
diff --git a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
index 87e46b1..697eb90 100644
--- a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; Verifies correctness of load/store of parameters and return values.
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap %s
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | %ptxas-verify %}
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | %ptxas-verify %}
%s_i8i16p = type { <{ i16, i8, i16 }>, i64 }
%s_i8i32p = type { <{ i32, i8, i32 }>, i64 }
@@ -24,37 +24,35 @@
define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) {
; CHECK-LABEL: test_s_i8i16p(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<15>;
+; CHECK-NEXT: .reg .b16 %rs<13>;
+; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs4, [test_s_i8i16p_param_0+4];
-; CHECK-NEXT: shl.b16 %rs5, %rs4, 8;
-; CHECK-NEXT: ld.param.b8 %rs6, [test_s_i8i16p_param_0+3];
-; CHECK-NEXT: or.b16 %rs3, %rs5, %rs6;
+; CHECK-NEXT: ld.param.b32 %r1, [test_s_i8i16p_param_0];
; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8i16p_param_0+8];
-; CHECK-NEXT: ld.param.b8 %rs2, [test_s_i8i16p_param_0+2];
-; CHECK-NEXT: ld.param.b16 %rs1, [test_s_i8i16p_param_0];
+; CHECK-NEXT: ld.param.b8 %rs1, [test_s_i8i16p_param_0+4];
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 8 .b8 param0[16];
-; CHECK-NEXT: st.param.b16 [param0], %rs1;
-; CHECK-NEXT: st.param.b8 [param0+2], %rs2;
-; CHECK-NEXT: st.param.b8 [param0+3], %rs3;
-; CHECK-NEXT: st.param.b8 [param0+4], %rs4;
-; CHECK-NEXT: st.param.b64 [param0+8], %rd1;
; CHECK-NEXT: .param .align 8 .b8 retval0[16];
+; CHECK-NEXT: st.param.b8 [param0+4], %rs1;
+; CHECK-NEXT: st.param.b64 [param0+8], %rd1;
+; CHECK-NEXT: st.param.b32 [param0], %r1;
; CHECK-NEXT: call.uni (retval0), test_s_i8i16p, (param0);
-; CHECK-NEXT: ld.param.b16 %rs7, [retval0];
-; CHECK-NEXT: ld.param.b8 %rs8, [retval0+2];
-; CHECK-NEXT: ld.param.b8 %rs9, [retval0+3];
-; CHECK-NEXT: ld.param.b8 %rs10, [retval0+4];
; CHECK-NEXT: ld.param.b64 %rd2, [retval0+8];
+; CHECK-NEXT: ld.param.b8 %rs2, [retval0+2];
+; CHECK-NEXT: ld.param.b16 %rs3, [retval0];
+; CHECK-NEXT: ld.param.b8 %rs4, [retval0+4];
+; CHECK-NEXT: ld.param.b8 %rs5, [retval0+3];
; CHECK-NEXT: } // callseq 0
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs7;
-; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs8;
-; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs10;
-; CHECK-NEXT: st.param.b8 [func_retval0+3], %rs9;
+; CHECK-NEXT: shl.b16 %rs8, %rs4, 8;
+; CHECK-NEXT: or.b16 %rs9, %rs8, %rs5;
+; CHECK-NEXT: st.param.b8 [func_retval0+3], %rs5;
; CHECK-NEXT: st.param.b64 [func_retval0+8], %rd2;
+; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs2;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT: shr.u16 %rs12, %rs9, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs12;
; CHECK-NEXT: ret;
%r = tail call %s_i8i16p @test_s_i8i16p(%s_i8i16p %a)
ret %s_i8i16p %r
@@ -64,56 +62,51 @@ define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) {
define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) {
; CHECK-LABEL: test_s_i8i32p(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<12>;
-; CHECK-NEXT: .reg .b32 %r<20>;
+; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<24>;
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %r3, [test_s_i8i32p_param_0+6];
-; CHECK-NEXT: shl.b32 %r4, %r3, 8;
-; CHECK-NEXT: ld.param.b8 %r5, [test_s_i8i32p_param_0+5];
-; CHECK-NEXT: or.b32 %r6, %r4, %r5;
-; CHECK-NEXT: ld.param.b8 %r7, [test_s_i8i32p_param_0+7];
-; CHECK-NEXT: shl.b32 %r8, %r7, 16;
-; CHECK-NEXT: ld.param.b8 %r9, [test_s_i8i32p_param_0+8];
-; CHECK-NEXT: shl.b32 %r10, %r9, 24;
-; CHECK-NEXT: or.b32 %r11, %r10, %r8;
-; CHECK-NEXT: or.b32 %r2, %r11, %r6;
-; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8i32p_param_0+16];
-; CHECK-NEXT: ld.param.b8 %rs1, [test_s_i8i32p_param_0+4];
; CHECK-NEXT: ld.param.b32 %r1, [test_s_i8i32p_param_0];
-; CHECK-NEXT: shr.u32 %r12, %r2, 8;
-; CHECK-NEXT: shr.u32 %r13, %r11, 16;
+; CHECK-NEXT: ld.param.b16 %rs1, [test_s_i8i32p_param_0+4];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8i32p_param_0+16];
+; CHECK-NEXT: ld.param.b8 %r2, [test_s_i8i32p_param_0+6];
+; CHECK-NEXT: ld.param.b8 %r3, [test_s_i8i32p_param_0+7];
+; CHECK-NEXT: ld.param.b8 %r4, [test_s_i8i32p_param_0+8];
; CHECK-NEXT: { // callseq 1, 0
; CHECK-NEXT: .param .align 8 .b8 param0[24];
-; CHECK-NEXT: st.param.b32 [param0], %r1;
-; CHECK-NEXT: st.param.b8 [param0+4], %rs1;
-; CHECK-NEXT: st.param.b8 [param0+5], %r2;
-; CHECK-NEXT: st.param.b8 [param0+6], %r12;
-; CHECK-NEXT: st.param.b8 [param0+7], %r13;
-; CHECK-NEXT: st.param.b8 [param0+8], %r9;
-; CHECK-NEXT: st.param.b64 [param0+16], %rd1;
; CHECK-NEXT: .param .align 8 .b8 retval0[24];
+; CHECK-NEXT: st.param.b8 [param0+8], %r4;
+; CHECK-NEXT: st.param.b8 [param0+7], %r3;
+; CHECK-NEXT: st.param.b8 [param0+6], %r2;
+; CHECK-NEXT: st.param.b64 [param0+16], %rd1;
+; CHECK-NEXT: st.param.b16 [param0+4], %rs1;
+; CHECK-NEXT: st.param.b32 [param0], %r1;
; CHECK-NEXT: call.uni (retval0), test_s_i8i32p, (param0);
-; CHECK-NEXT: ld.param.b32 %r14, [retval0];
-; CHECK-NEXT: ld.param.b8 %rs2, [retval0+4];
-; CHECK-NEXT: ld.param.b8 %rs3, [retval0+5];
-; CHECK-NEXT: ld.param.b8 %rs4, [retval0+6];
-; CHECK-NEXT: ld.param.b8 %rs5, [retval0+7];
-; CHECK-NEXT: ld.param.b8 %rs6, [retval0+8];
; CHECK-NEXT: ld.param.b64 %rd2, [retval0+16];
+; CHECK-NEXT: ld.param.b8 %rs2, [retval0+4];
+; CHECK-NEXT: ld.param.b32 %r5, [retval0];
+; CHECK-NEXT: ld.param.b8 %r6, [retval0+8];
+; CHECK-NEXT: ld.param.b8 %r7, [retval0+7];
+; CHECK-NEXT: ld.param.b8 %r8, [retval0+6];
+; CHECK-NEXT: ld.param.b8 %r9, [retval0+5];
; CHECK-NEXT: } // callseq 1
-; CHECK-NEXT: cvt.u32.u16 %r15, %rs3;
-; CHECK-NEXT: cvt.u32.u16 %r16, %rs4;
-; CHECK-NEXT: cvt.u32.u16 %r17, %rs5;
-; CHECK-NEXT: cvt.u32.u16 %r18, %rs6;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r14;
-; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2;
-; CHECK-NEXT: st.param.b8 [func_retval0+8], %r18;
-; CHECK-NEXT: st.param.b8 [func_retval0+7], %r17;
-; CHECK-NEXT: st.param.b8 [func_retval0+6], %r16;
-; CHECK-NEXT: st.param.b8 [func_retval0+5], %r15;
+; CHECK-NEXT: shl.b32 %r12, %r8, 8;
+; CHECK-NEXT: or.b32 %r13, %r12, %r9;
+; CHECK-NEXT: shl.b32 %r15, %r7, 16;
+; CHECK-NEXT: shl.b32 %r17, %r6, 24;
+; CHECK-NEXT: or.b32 %r18, %r17, %r15;
+; CHECK-NEXT: or.b32 %r19, %r18, %r13;
+; CHECK-NEXT: st.param.b8 [func_retval0+5], %r9;
; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd2;
+; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: shr.u32 %r21, %r19, 24;
+; CHECK-NEXT: st.param.b8 [func_retval0+8], %r21;
+; CHECK-NEXT: shr.u32 %r22, %r19, 16;
+; CHECK-NEXT: st.param.b8 [func_retval0+7], %r22;
+; CHECK-NEXT: shr.u32 %r23, %r19, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+6], %r23;
; CHECK-NEXT: ret;
%r = tail call %s_i8i32p @test_s_i8i32p(%s_i8i32p %a)
ret %s_i8i32p %r
@@ -123,112 +116,66 @@ define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) {
define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) {
; CHECK-LABEL: test_s_i8i64p(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<20>;
-; CHECK-NEXT: .reg .b64 %rd<68>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<46>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rd4, [test_s_i8i64p_param_0+10];
-; CHECK-NEXT: shl.b64 %rd5, %rd4, 8;
-; CHECK-NEXT: ld.param.b8 %rd6, [test_s_i8i64p_param_0+9];
-; CHECK-NEXT: or.b64 %rd7, %rd5, %rd6;
-; CHECK-NEXT: ld.param.b8 %rd8, [test_s_i8i64p_param_0+11];
-; CHECK-NEXT: shl.b64 %rd9, %rd8, 16;
-; CHECK-NEXT: ld.param.b8 %rd10, [test_s_i8i64p_param_0+12];
-; CHECK-NEXT: shl.b64 %rd11, %rd10, 24;
-; CHECK-NEXT: or.b64 %rd12, %rd11, %rd9;
-; CHECK-NEXT: or.b64 %rd13, %rd12, %rd7;
-; CHECK-NEXT: ld.param.b8 %rd14, [test_s_i8i64p_param_0+14];
-; CHECK-NEXT: shl.b64 %rd15, %rd14, 8;
-; CHECK-NEXT: ld.param.b8 %rd16, [test_s_i8i64p_param_0+13];
-; CHECK-NEXT: or.b64 %rd17, %rd15, %rd16;
-; CHECK-NEXT: ld.param.b8 %rd18, [test_s_i8i64p_param_0+15];
-; CHECK-NEXT: shl.b64 %rd19, %rd18, 16;
-; CHECK-NEXT: ld.param.b8 %rd20, [test_s_i8i64p_param_0+16];
-; CHECK-NEXT: shl.b64 %rd21, %rd20, 24;
-; CHECK-NEXT: or.b64 %rd22, %rd21, %rd19;
-; CHECK-NEXT: or.b64 %rd23, %rd22, %rd17;
-; CHECK-NEXT: shl.b64 %rd24, %rd23, 32;
-; CHECK-NEXT: or.b64 %rd2, %rd24, %rd13;
-; CHECK-NEXT: ld.param.b64 %rd3, [test_s_i8i64p_param_0+24];
-; CHECK-NEXT: ld.param.b8 %rs1, [test_s_i8i64p_param_0+8];
; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8i64p_param_0];
-; CHECK-NEXT: shr.u64 %rd25, %rd2, 8;
-; CHECK-NEXT: shr.u64 %rd26, %rd2, 16;
-; CHECK-NEXT: shr.u64 %rd27, %rd2, 24;
-; CHECK-NEXT: bfe.u64 %rd28, %rd23, 8, 24;
-; CHECK-NEXT: bfe.u64 %rd29, %rd23, 16, 16;
-; CHECK-NEXT: bfe.u64 %rd30, %rd23, 24, 8;
+; CHECK-NEXT: ld.param.b64 %rd2, [test_s_i8i64p_param_0+8];
+; CHECK-NEXT: ld.param.b64 %rd3, [test_s_i8i64p_param_0+24];
+; CHECK-NEXT: ld.param.b8 %rd4, [test_s_i8i64p_param_0+16];
; CHECK-NEXT: { // callseq 2, 0
; CHECK-NEXT: .param .align 8 .b8 param0[32];
-; CHECK-NEXT: st.param.b64 [param0], %rd1;
-; CHECK-NEXT: st.param.b8 [param0+8], %rs1;
-; CHECK-NEXT: st.param.b8 [param0+9], %rd2;
-; CHECK-NEXT: st.param.b8 [param0+10], %rd25;
-; CHECK-NEXT: st.param.b8 [param0+11], %rd26;
-; CHECK-NEXT: st.param.b8 [param0+12], %rd27;
-; CHECK-NEXT: st.param.b8 [param0+13], %rd23;
-; CHECK-NEXT: st.param.b8 [param0+14], %rd28;
-; CHECK-NEXT: st.param.b8 [param0+15], %rd29;
-; CHECK-NEXT: st.param.b8 [param0+16], %rd30;
-; CHECK-NEXT: st.param.b64 [param0+24], %rd3;
; CHECK-NEXT: .param .align 8 .b8 retval0[32];
+; CHECK-NEXT: st.param.b8 [param0+16], %rd4;
+; CHECK-NEXT: st.param.b64 [param0+24], %rd3;
+; CHECK-NEXT: st.param.b64 [param0+8], %rd2;
+; CHECK-NEXT: st.param.b64 [param0], %rd1;
; CHECK-NEXT: call.uni (retval0), test_s_i8i64p, (param0);
-; CHECK-NEXT: ld.param.b64 %rd31, [retval0];
-; CHECK-NEXT: ld.param.b8 %rs2, [retval0+8];
-; CHECK-NEXT: ld.param.b8 %rs3, [retval0+9];
-; CHECK-NEXT: ld.param.b8 %rs4, [retval0+10];
-; CHECK-NEXT: ld.param.b8 %rs5, [retval0+11];
-; CHECK-NEXT: ld.param.b8 %rs6, [retval0+12];
-; CHECK-NEXT: ld.param.b8 %rs7, [retval0+13];
-; CHECK-NEXT: ld.param.b8 %rs8, [retval0+14];
-; CHECK-NEXT: ld.param.b8 %rs9, [retval0+15];
-; CHECK-NEXT: ld.param.b8 %rs10, [retval0+16];
-; CHECK-NEXT: ld.param.b64 %rd32, [retval0+24];
+; CHECK-NEXT: ld.param.b64 %rd5, [retval0+24];
+; CHECK-NEXT: ld.param.b8 %rs1, [retval0+8];
+; CHECK-NEXT: ld.param.b64 %rd6, [retval0];
+; CHECK-NEXT: ld.param.b8 %rd7, [retval0+16];
+; CHECK-NEXT: ld.param.b8 %rd8, [retval0+15];
+; CHECK-NEXT: ld.param.b8 %rd9, [retval0+14];
+; CHECK-NEXT: ld.param.b8 %rd10, [retval0+13];
+; CHECK-NEXT: ld.param.b8 %rd11, [retval0+12];
+; CHECK-NEXT: ld.param.b8 %rd12, [retval0+11];
+; CHECK-NEXT: ld.param.b8 %rd13, [retval0+10];
+; CHECK-NEXT: ld.param.b8 %rd14, [retval0+9];
; CHECK-NEXT: } // callseq 2
-; CHECK-NEXT: cvt.u64.u16 %rd33, %rs3;
-; CHECK-NEXT: and.b64 %rd34, %rd33, 255;
-; CHECK-NEXT: cvt.u64.u16 %rd35, %rs4;
-; CHECK-NEXT: and.b64 %rd36, %rd35, 255;
-; CHECK-NEXT: shl.b64 %rd37, %rd36, 8;
-; CHECK-NEXT: or.b64 %rd38, %rd34, %rd37;
-; CHECK-NEXT: cvt.u64.u16 %rd39, %rs5;
-; CHECK-NEXT: and.b64 %rd40, %rd39, 255;
-; CHECK-NEXT: shl.b64 %rd41, %rd40, 16;
-; CHECK-NEXT: or.b64 %rd42, %rd38, %rd41;
-; CHECK-NEXT: cvt.u64.u16 %rd43, %rs6;
-; CHECK-NEXT: and.b64 %rd44, %rd43, 255;
-; CHECK-NEXT: shl.b64 %rd45, %rd44, 24;
-; CHECK-NEXT: or.b64 %rd46, %rd42, %rd45;
-; CHECK-NEXT: cvt.u64.u16 %rd47, %rs7;
-; CHECK-NEXT: and.b64 %rd48, %rd47, 255;
-; CHECK-NEXT: shl.b64 %rd49, %rd48, 32;
-; CHECK-NEXT: or.b64 %rd50, %rd46, %rd49;
-; CHECK-NEXT: cvt.u64.u16 %rd51, %rs8;
-; CHECK-NEXT: and.b64 %rd52, %rd51, 255;
-; CHECK-NEXT: shl.b64 %rd53, %rd52, 40;
-; CHECK-NEXT: or.b64 %rd54, %rd50, %rd53;
-; CHECK-NEXT: cvt.u64.u16 %rd55, %rs9;
-; CHECK-NEXT: and.b64 %rd56, %rd55, 255;
-; CHECK-NEXT: shl.b64 %rd57, %rd56, 48;
-; CHECK-NEXT: or.b64 %rd58, %rd54, %rd57;
-; CHECK-NEXT: cvt.u64.u16 %rd59, %rs10;
-; CHECK-NEXT: shl.b64 %rd60, %rd59, 56;
-; CHECK-NEXT: or.b64 %rd61, %rd58, %rd60;
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd31;
-; CHECK-NEXT: st.param.b8 [func_retval0+8], %rs2;
+; CHECK-NEXT: shl.b64 %rd17, %rd13, 8;
+; CHECK-NEXT: or.b64 %rd18, %rd17, %rd14;
+; CHECK-NEXT: shl.b64 %rd20, %rd12, 16;
+; CHECK-NEXT: shl.b64 %rd22, %rd11, 24;
+; CHECK-NEXT: or.b64 %rd23, %rd22, %rd20;
+; CHECK-NEXT: or.b64 %rd24, %rd23, %rd18;
+; CHECK-NEXT: shl.b64 %rd27, %rd9, 8;
+; CHECK-NEXT: or.b64 %rd28, %rd27, %rd10;
+; CHECK-NEXT: shl.b64 %rd30, %rd8, 16;
+; CHECK-NEXT: shl.b64 %rd32, %rd7, 24;
+; CHECK-NEXT: or.b64 %rd33, %rd32, %rd30;
+; CHECK-NEXT: or.b64 %rd34, %rd33, %rd28;
+; CHECK-NEXT: shl.b64 %rd35, %rd34, 32;
+; CHECK-NEXT: or.b64 %rd36, %rd35, %rd24;
+; CHECK-NEXT: st.param.b8 [func_retval0+9], %rd14;
+; CHECK-NEXT: st.param.b64 [func_retval0+24], %rd5;
+; CHECK-NEXT: st.param.b8 [func_retval0+8], %rs1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd6;
+; CHECK-NEXT: shr.u64 %rd39, %rd36, 56;
+; CHECK-NEXT: st.param.b8 [func_retval0+16], %rd39;
+; CHECK-NEXT: shr.u64 %rd40, %rd36, 48;
+; CHECK-NEXT: st.param.b8 [func_retval0+15], %rd40;
+; CHECK-NEXT: shr.u64 %rd41, %rd36, 40;
+; CHECK-NEXT: st.param.b8 [func_retval0+14], %rd41;
+; CHECK-NEXT: shr.u64 %rd42, %rd36, 32;
+; CHECK-NEXT: st.param.b8 [func_retval0+13], %rd42;
+; CHECK-NEXT: shr.u64 %rd43, %rd36, 24;
; CHECK-NEXT: st.param.b8 [func_retval0+12], %rd43;
-; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd39;
-; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd35;
-; CHECK-NEXT: st.param.b8 [func_retval0+9], %rd33;
-; CHECK-NEXT: shr.u64 %rd64, %rd50, 32;
-; CHECK-NEXT: st.param.b8 [func_retval0+13], %rd64;
-; CHECK-NEXT: shr.u64 %rd65, %rd54, 40;
-; CHECK-NEXT: st.param.b8 [func_retval0+14], %rd65;
-; CHECK-NEXT: shr.u64 %rd66, %rd58, 48;
-; CHECK-NEXT: st.param.b8 [func_retval0+15], %rd66;
-; CHECK-NEXT: shr.u64 %rd67, %rd61, 56;
-; CHECK-NEXT: st.param.b8 [func_retval0+16], %rd67;
-; CHECK-NEXT: st.param.b64 [func_retval0+24], %rd32;
+; CHECK-NEXT: shr.u64 %rd44, %rd36, 16;
+; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd44;
+; CHECK-NEXT: shr.u64 %rd45, %rd36, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd45;
; CHECK-NEXT: ret;
%r = tail call %s_i8i64p @test_s_i8i64p(%s_i8i64p %a)
ret %s_i8i64p %r
@@ -242,33 +189,32 @@ define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) {
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rs4, [test_s_i8f16p_param_0+4];
-; CHECK-NEXT: shl.b16 %rs5, %rs4, 8;
-; CHECK-NEXT: ld.param.b8 %rs6, [test_s_i8f16p_param_0+3];
-; CHECK-NEXT: or.b16 %rs3, %rs5, %rs6;
-; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8f16p_param_0+8];
-; CHECK-NEXT: ld.param.b8 %rs2, [test_s_i8f16p_param_0+2];
; CHECK-NEXT: ld.param.b16 %rs1, [test_s_i8f16p_param_0];
+; CHECK-NEXT: ld.param.b16 %rs2, [test_s_i8f16p_param_0+2];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8f16p_param_0+8];
+; CHECK-NEXT: ld.param.b8 %rs3, [test_s_i8f16p_param_0+4];
; CHECK-NEXT: { // callseq 3, 0
; CHECK-NEXT: .param .align 8 .b8 param0[16];
-; CHECK-NEXT: st.param.b16 [param0], %rs1;
-; CHECK-NEXT: st.param.b8 [param0+2], %rs2;
-; CHECK-NEXT: st.param.b8 [param0+3], %rs3;
-; CHECK-NEXT: st.param.b8 [param0+4], %rs4;
-; CHECK-NEXT: st.param.b64 [param0+8], %rd1;
; CHECK-NEXT: .param .align 8 .b8 retval0[16];
+; CHECK-NEXT: st.param.b8 [param0+4], %rs3;
+; CHECK-NEXT: st.param.b64 [param0+8], %rd1;
+; CHECK-NEXT: st.param.b16 [param0+2], %rs2;
+; CHECK-NEXT: st.param.b16 [param0], %rs1;
; CHECK-NEXT: call.uni (retval0), test_s_i8f16p, (param0);
-; CHECK-NEXT: ld.param.b16 %rs7, [retval0];
-; CHECK-NEXT: ld.param.b8 %rs8, [retval0+2];
-; CHECK-NEXT: ld.param.b8 %rs9, [retval0+3];
-; CHECK-NEXT: ld.param.b8 %rs10, [retval0+4];
; CHECK-NEXT: ld.param.b64 %rd2, [retval0+8];
+; CHECK-NEXT: ld.param.b8 %rs4, [retval0+2];
+; CHECK-NEXT: ld.param.b16 %rs5, [retval0];
+; CHECK-NEXT: ld.param.b8 %rs6, [retval0+4];
+; CHECK-NEXT: ld.param.b8 %rs7, [retval0+3];
; CHECK-NEXT: } // callseq 3
-; CHECK-NEXT: st.param.b16 [func_retval0], %rs7;
-; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs8;
-; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs10;
-; CHECK-NEXT: st.param.b8 [func_retval0+3], %rs9;
+; CHECK-NEXT: shl.b16 %rs10, %rs6, 8;
+; CHECK-NEXT: or.b16 %rs11, %rs10, %rs7;
+; CHECK-NEXT: st.param.b8 [func_retval0+3], %rs7;
; CHECK-NEXT: st.param.b64 [func_retval0+8], %rd2;
+; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs4;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs5;
+; CHECK-NEXT: shr.u16 %rs14, %rs11, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs14;
; CHECK-NEXT: ret;
%r = tail call %s_i8f16p @test_s_i8f16p(%s_i8f16p %a)
ret %s_i8f16p %r
@@ -278,56 +224,51 @@ define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) {
define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) {
; CHECK-LABEL: test_s_i8f16x2p(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<12>;
-; CHECK-NEXT: .reg .b32 %r<20>;
+; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<24>;
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %r3, [test_s_i8f16x2p_param_0+6];
-; CHECK-NEXT: shl.b32 %r4, %r3, 8;
-; CHECK-NEXT: ld.param.b8 %r5, [test_s_i8f16x2p_param_0+5];
-; CHECK-NEXT: or.b32 %r6, %r4, %r5;
-; CHECK-NEXT: ld.param.b8 %r7, [test_s_i8f16x2p_param_0+7];
-; CHECK-NEXT: shl.b32 %r8, %r7, 16;
-; CHECK-NEXT: ld.param.b8 %r9, [test_s_i8f16x2p_param_0+8];
-; CHECK-NEXT: shl.b32 %r10, %r9, 24;
-; CHECK-NEXT: or.b32 %r11, %r10, %r8;
-; CHECK-NEXT: or.b32 %r2, %r11, %r6;
-; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8f16x2p_param_0+16];
-; CHECK-NEXT: ld.param.b8 %rs1, [test_s_i8f16x2p_param_0+4];
; CHECK-NEXT: ld.param.b32 %r1, [test_s_i8f16x2p_param_0];
-; CHECK-NEXT: shr.u32 %r12, %r2, 8;
-; CHECK-NEXT: shr.u32 %r13, %r11, 16;
+; CHECK-NEXT: ld.param.b16 %rs1, [test_s_i8f16x2p_param_0+4];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8f16x2p_param_0+16];
+; CHECK-NEXT: ld.param.b8 %r2, [test_s_i8f16x2p_param_0+6];
+; CHECK-NEXT: ld.param.b8 %r3, [test_s_i8f16x2p_param_0+7];
+; CHECK-NEXT: ld.param.b8 %r4, [test_s_i8f16x2p_param_0+8];
; CHECK-NEXT: { // callseq 4, 0
; CHECK-NEXT: .param .align 8 .b8 param0[24];
-; CHECK-NEXT: st.param.b32 [param0], %r1;
-; CHECK-NEXT: st.param.b8 [param0+4], %rs1;
-; CHECK-NEXT: st.param.b8 [param0+5], %r2;
-; CHECK-NEXT: st.param.b8 [param0+6], %r12;
-; CHECK-NEXT: st.param.b8 [param0+7], %r13;
-; CHECK-NEXT: st.param.b8 [param0+8], %r9;
-; CHECK-NEXT: st.param.b64 [param0+16], %rd1;
; CHECK-NEXT: .param .align 8 .b8 retval0[24];
+; CHECK-NEXT: st.param.b8 [param0+8], %r4;
+; CHECK-NEXT: st.param.b8 [param0+7], %r3;
+; CHECK-NEXT: st.param.b8 [param0+6], %r2;
+; CHECK-NEXT: st.param.b64 [param0+16], %rd1;
+; CHECK-NEXT: st.param.b16 [param0+4], %rs1;
+; CHECK-NEXT: st.param.b32 [param0], %r1;
; CHECK-NEXT: call.uni (retval0), test_s_i8f16x2p, (param0);
-; CHECK-NEXT: ld.param.b32 %r14, [retval0];
-; CHECK-NEXT: ld.param.b8 %rs2, [retval0+4];
-; CHECK-NEXT: ld.param.b8 %rs3, [retval0+5];
-; CHECK-NEXT: ld.param.b8 %rs4, [retval0+6];
-; CHECK-NEXT: ld.param.b8 %rs5, [retval0+7];
-; CHECK-NEXT: ld.param.b8 %rs6, [retval0+8];
; CHECK-NEXT: ld.param.b64 %rd2, [retval0+16];
+; CHECK-NEXT: ld.param.b8 %rs2, [retval0+4];
+; CHECK-NEXT: ld.param.b32 %r5, [retval0];
+; CHECK-NEXT: ld.param.b8 %r6, [retval0+8];
+; CHECK-NEXT: ld.param.b8 %r7, [retval0+7];
+; CHECK-NEXT: ld.param.b8 %r8, [retval0+6];
+; CHECK-NEXT: ld.param.b8 %r9, [retval0+5];
; CHECK-NEXT: } // callseq 4
-; CHECK-NEXT: cvt.u32.u16 %r15, %rs3;
-; CHECK-NEXT: cvt.u32.u16 %r16, %rs4;
-; CHECK-NEXT: cvt.u32.u16 %r17, %rs5;
-; CHECK-NEXT: cvt.u32.u16 %r18, %rs6;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r14;
-; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2;
-; CHECK-NEXT: st.param.b8 [func_retval0+8], %r18;
-; CHECK-NEXT: st.param.b8 [func_retval0+7], %r17;
-; CHECK-NEXT: st.param.b8 [func_retval0+6], %r16;
-; CHECK-NEXT: st.param.b8 [func_retval0+5], %r15;
+; CHECK-NEXT: shl.b32 %r12, %r8, 8;
+; CHECK-NEXT: or.b32 %r13, %r12, %r9;
+; CHECK-NEXT: shl.b32 %r15, %r7, 16;
+; CHECK-NEXT: shl.b32 %r17, %r6, 24;
+; CHECK-NEXT: or.b32 %r18, %r17, %r15;
+; CHECK-NEXT: or.b32 %r19, %r18, %r13;
+; CHECK-NEXT: st.param.b8 [func_retval0+5], %r9;
; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd2;
+; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: shr.u32 %r21, %r19, 24;
+; CHECK-NEXT: st.param.b8 [func_retval0+8], %r21;
+; CHECK-NEXT: shr.u32 %r22, %r19, 16;
+; CHECK-NEXT: st.param.b8 [func_retval0+7], %r22;
+; CHECK-NEXT: shr.u32 %r23, %r19, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+6], %r23;
; CHECK-NEXT: ret;
%r = tail call %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a)
ret %s_i8f16x2p %r
@@ -337,56 +278,51 @@ define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) {
define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
; CHECK-LABEL: test_s_i8f32p(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<12>;
-; CHECK-NEXT: .reg .b32 %r<20>;
+; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<24>;
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %r3, [test_s_i8f32p_param_0+6];
-; CHECK-NEXT: shl.b32 %r4, %r3, 8;
-; CHECK-NEXT: ld.param.b8 %r5, [test_s_i8f32p_param_0+5];
-; CHECK-NEXT: or.b32 %r6, %r4, %r5;
-; CHECK-NEXT: ld.param.b8 %r7, [test_s_i8f32p_param_0+7];
-; CHECK-NEXT: shl.b32 %r8, %r7, 16;
-; CHECK-NEXT: ld.param.b8 %r9, [test_s_i8f32p_param_0+8];
-; CHECK-NEXT: shl.b32 %r10, %r9, 24;
-; CHECK-NEXT: or.b32 %r11, %r10, %r8;
-; CHECK-NEXT: or.b32 %r2, %r11, %r6;
-; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8f32p_param_0+16];
-; CHECK-NEXT: ld.param.b8 %rs1, [test_s_i8f32p_param_0+4];
; CHECK-NEXT: ld.param.b32 %r1, [test_s_i8f32p_param_0];
-; CHECK-NEXT: shr.u32 %r12, %r2, 8;
-; CHECK-NEXT: shr.u32 %r13, %r11, 16;
+; CHECK-NEXT: ld.param.b16 %rs1, [test_s_i8f32p_param_0+4];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8f32p_param_0+16];
+; CHECK-NEXT: ld.param.b8 %r2, [test_s_i8f32p_param_0+6];
+; CHECK-NEXT: ld.param.b8 %r3, [test_s_i8f32p_param_0+7];
+; CHECK-NEXT: ld.param.b8 %r4, [test_s_i8f32p_param_0+8];
; CHECK-NEXT: { // callseq 5, 0
; CHECK-NEXT: .param .align 8 .b8 param0[24];
-; CHECK-NEXT: st.param.b32 [param0], %r1;
-; CHECK-NEXT: st.param.b8 [param0+4], %rs1;
-; CHECK-NEXT: st.param.b8 [param0+5], %r2;
-; CHECK-NEXT: st.param.b8 [param0+6], %r12;
-; CHECK-NEXT: st.param.b8 [param0+7], %r13;
-; CHECK-NEXT: st.param.b8 [param0+8], %r9;
-; CHECK-NEXT: st.param.b64 [param0+16], %rd1;
; CHECK-NEXT: .param .align 8 .b8 retval0[24];
+; CHECK-NEXT: st.param.b8 [param0+8], %r4;
+; CHECK-NEXT: st.param.b8 [param0+7], %r3;
+; CHECK-NEXT: st.param.b8 [param0+6], %r2;
+; CHECK-NEXT: st.param.b64 [param0+16], %rd1;
+; CHECK-NEXT: st.param.b16 [param0+4], %rs1;
+; CHECK-NEXT: st.param.b32 [param0], %r1;
; CHECK-NEXT: call.uni (retval0), test_s_i8f32p, (param0);
-; CHECK-NEXT: ld.param.b32 %r14, [retval0];
-; CHECK-NEXT: ld.param.b8 %rs2, [retval0+4];
-; CHECK-NEXT: ld.param.b8 %rs3, [retval0+5];
-; CHECK-NEXT: ld.param.b8 %rs4, [retval0+6];
-; CHECK-NEXT: ld.param.b8 %rs5, [retval0+7];
-; CHECK-NEXT: ld.param.b8 %rs6, [retval0+8];
; CHECK-NEXT: ld.param.b64 %rd2, [retval0+16];
+; CHECK-NEXT: ld.param.b8 %rs2, [retval0+4];
+; CHECK-NEXT: ld.param.b32 %r5, [retval0];
+; CHECK-NEXT: ld.param.b8 %r6, [retval0+8];
+; CHECK-NEXT: ld.param.b8 %r7, [retval0+7];
+; CHECK-NEXT: ld.param.b8 %r8, [retval0+6];
+; CHECK-NEXT: ld.param.b8 %r9, [retval0+5];
; CHECK-NEXT: } // callseq 5
-; CHECK-NEXT: cvt.u32.u16 %r15, %rs3;
-; CHECK-NEXT: cvt.u32.u16 %r16, %rs4;
-; CHECK-NEXT: cvt.u32.u16 %r17, %rs5;
-; CHECK-NEXT: cvt.u32.u16 %r18, %rs6;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r14;
-; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2;
-; CHECK-NEXT: st.param.b8 [func_retval0+8], %r18;
-; CHECK-NEXT: st.param.b8 [func_retval0+7], %r17;
-; CHECK-NEXT: st.param.b8 [func_retval0+6], %r16;
-; CHECK-NEXT: st.param.b8 [func_retval0+5], %r15;
+; CHECK-NEXT: shl.b32 %r12, %r8, 8;
+; CHECK-NEXT: or.b32 %r13, %r12, %r9;
+; CHECK-NEXT: shl.b32 %r15, %r7, 16;
+; CHECK-NEXT: shl.b32 %r17, %r6, 24;
+; CHECK-NEXT: or.b32 %r18, %r17, %r15;
+; CHECK-NEXT: or.b32 %r19, %r18, %r13;
+; CHECK-NEXT: st.param.b8 [func_retval0+5], %r9;
; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd2;
+; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT: shr.u32 %r21, %r19, 24;
+; CHECK-NEXT: st.param.b8 [func_retval0+8], %r21;
+; CHECK-NEXT: shr.u32 %r22, %r19, 16;
+; CHECK-NEXT: st.param.b8 [func_retval0+7], %r22;
+; CHECK-NEXT: shr.u32 %r23, %r19, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+6], %r23;
; CHECK-NEXT: ret;
%r = tail call %s_i8f32p @test_s_i8f32p(%s_i8f32p %a)
ret %s_i8f32p %r
@@ -396,112 +332,66 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
define %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) {
; CHECK-LABEL: test_s_i8f64p(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<20>;
-; CHECK-NEXT: .reg .b64 %rd<68>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<46>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rd4, [test_s_i8f64p_param_0+10];
-; CHECK-NEXT: shl.b64 %rd5, %rd4, 8;
-; CHECK-NEXT: ld.param.b8 %rd6, [test_s_i8f64p_param_0+9];
-; CHECK-NEXT: or.b64 %rd7, %rd5, %rd6;
-; CHECK-NEXT: ld.param.b8 %rd8, [test_s_i8f64p_param_0+11];
-; CHECK-NEXT: shl.b64 %rd9, %rd8, 16;
-; CHECK-NEXT: ld.param.b8 %rd10, [test_s_i8f64p_param_0+12];
-; CHECK-NEXT: shl.b64 %rd11, %rd10, 24;
-; CHECK-NEXT: or.b64 %rd12, %rd11, %rd9;
-; CHECK-NEXT: or.b64 %rd13, %rd12, %rd7;
-; CHECK-NEXT: ld.param.b8 %rd14, [test_s_i8f64p_param_0+14];
-; CHECK-NEXT: shl.b64 %rd15, %rd14, 8;
-; CHECK-NEXT: ld.param.b8 %rd16, [test_s_i8f64p_param_0+13];
-; CHECK-NEXT: or.b64 %rd17, %rd15, %rd16;
-; CHECK-NEXT: ld.param.b8 %rd18, [test_s_i8f64p_param_0+15];
-; CHECK-NEXT: shl.b64 %rd19, %rd18, 16;
-; CHECK-NEXT: ld.param.b8 %rd20, [test_s_i8f64p_param_0+16];
-; CHECK-NEXT: shl.b64 %rd21, %rd20, 24;
-; CHECK-NEXT: or.b64 %rd22, %rd21, %rd19;
-; CHECK-NEXT: or.b64 %rd23, %rd22, %rd17;
-; CHECK-NEXT: shl.b64 %rd24, %rd23, 32;
-; CHECK-NEXT: or.b64 %rd2, %rd24, %rd13;
-; CHECK-NEXT: ld.param.b64 %rd3, [test_s_i8f64p_param_0+24];
-; CHECK-NEXT: ld.param.b8 %rs1, [test_s_i8f64p_param_0+8];
; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8f64p_param_0];
-; CHECK-NEXT: shr.u64 %rd25, %rd2, 8;
-; CHECK-NEXT: shr.u64 %rd26, %rd2, 16;
-; CHECK-NEXT: shr.u64 %rd27, %rd2, 24;
-; CHECK-NEXT: bfe.u64 %rd28, %rd23, 8, 24;
-; CHECK-NEXT: bfe.u64 %rd29, %rd23, 16, 16;
-; CHECK-NEXT: bfe.u64 %rd30, %rd23, 24, 8;
+; CHECK-NEXT: ld.param.b64 %rd2, [test_s_i8f64p_param_0+8];
+; CHECK-NEXT: ld.param.b64 %rd3, [test_s_i8f64p_param_0+24];
+; CHECK-NEXT: ld.param.b8 %rd4, [test_s_i8f64p_param_0+16];
; CHECK-NEXT: { // callseq 6, 0
; CHECK-NEXT: .param .align 8 .b8 param0[32];
-; CHECK-NEXT: st.param.b64 [param0], %rd1;
-; CHECK-NEXT: st.param.b8 [param0+8], %rs1;
-; CHECK-NEXT: st.param.b8 [param0+9], %rd2;
-; CHECK-NEXT: st.param.b8 [param0+10], %rd25;
-; CHECK-NEXT: st.param.b8 [param0+11], %rd26;
-; CHECK-NEXT: st.param.b8 [param0+12], %rd27;
-; CHECK-NEXT: st.param.b8 [param0+13], %rd23;
-; CHECK-NEXT: st.param.b8 [param0+14], %rd28;
-; CHECK-NEXT: st.param.b8 [param0+15], %rd29;
-; CHECK-NEXT: st.param.b8 [param0+16], %rd30;
-; CHECK-NEXT: st.param.b64 [param0+24], %rd3;
; CHECK-NEXT: .param .align 8 .b8 retval0[32];
+; CHECK-NEXT: st.param.b8 [param0+16], %rd4;
+; CHECK-NEXT: st.param.b64 [param0+24], %rd3;
+; CHECK-NEXT: st.param.b64 [param0+8], %rd2;
+; CHECK-NEXT: st.param.b64 [param0], %rd1;
; CHECK-NEXT: call.uni (retval0), test_s_i8f64p, (param0);
-; CHECK-NEXT: ld.param.b64 %rd31, [retval0];
-; CHECK-NEXT: ld.param.b8 %rs2, [retval0+8];
-; CHECK-NEXT: ld.param.b8 %rs3, [retval0+9];
-; CHECK-NEXT: ld.param.b8 %rs4, [retval0+10];
-; CHECK-NEXT: ld.param.b8 %rs5, [retval0+11];
-; CHECK-NEXT: ld.param.b8 %rs6, [retval0+12];
-; CHECK-NEXT: ld.param.b8 %rs7, [retval0+13];
-; CHECK-NEXT: ld.param.b8 %rs8, [retval0+14];
-; CHECK-NEXT: ld.param.b8 %rs9, [retval0+15];
-; CHECK-NEXT: ld.param.b8 %rs10, [retval0+16];
-; CHECK-NEXT: ld.param.b64 %rd32, [retval0+24];
+; CHECK-NEXT: ld.param.b64 %rd5, [retval0+24];
+; CHECK-NEXT: ld.param.b8 %rs1, [retval0+8];
+; CHECK-NEXT: ld.param.b64 %rd6, [retval0];
+; CHECK-NEXT: ld.param.b8 %rd7, [retval0+16];
+; CHECK-NEXT: ld.param.b8 %rd8, [retval0+15];
+; CHECK-NEXT: ld.param.b8 %rd9, [retval0+14];
+; CHECK-NEXT: ld.param.b8 %rd10, [retval0+13];
+; CHECK-NEXT: ld.param.b8 %rd11, [retval0+12];
+; CHECK-NEXT: ld.param.b8 %rd12, [retval0+11];
+; CHECK-NEXT: ld.param.b8 %rd13, [retval0+10];
+; CHECK-NEXT: ld.param.b8 %rd14, [retval0+9];
; CHECK-NEXT: } // callseq 6
-; CHECK-NEXT: cvt.u64.u16 %rd33, %rs3;
-; CHECK-NEXT: and.b64 %rd34, %rd33, 255;
-; CHECK-NEXT: cvt.u64.u16 %rd35, %rs4;
-; CHECK-NEXT: and.b64 %rd36, %rd35, 255;
-; CHECK-NEXT: shl.b64 %rd37, %rd36, 8;
-; CHECK-NEXT: or.b64 %rd38, %rd34, %rd37;
-; CHECK-NEXT: cvt.u64.u16 %rd39, %rs5;
-; CHECK-NEXT: and.b64 %rd40, %rd39, 255;
-; CHECK-NEXT: shl.b64 %rd41, %rd40, 16;
-; CHECK-NEXT: or.b64 %rd42, %rd38, %rd41;
-; CHECK-NEXT: cvt.u64.u16 %rd43, %rs6;
-; CHECK-NEXT: and.b64 %rd44, %rd43, 255;
-; CHECK-NEXT: shl.b64 %rd45, %rd44, 24;
-; CHECK-NEXT: or.b64 %rd46, %rd42, %rd45;
-; CHECK-NEXT: cvt.u64.u16 %rd47, %rs7;
-; CHECK-NEXT: and.b64 %rd48, %rd47, 255;
-; CHECK-NEXT: shl.b64 %rd49, %rd48, 32;
-; CHECK-NEXT: or.b64 %rd50, %rd46, %rd49;
-; CHECK-NEXT: cvt.u64.u16 %rd51, %rs8;
-; CHECK-NEXT: and.b64 %rd52, %rd51, 255;
-; CHECK-NEXT: shl.b64 %rd53, %rd52, 40;
-; CHECK-NEXT: or.b64 %rd54, %rd50, %rd53;
-; CHECK-NEXT: cvt.u64.u16 %rd55, %rs9;
-; CHECK-NEXT: and.b64 %rd56, %rd55, 255;
-; CHECK-NEXT: shl.b64 %rd57, %rd56, 48;
-; CHECK-NEXT: or.b64 %rd58, %rd54, %rd57;
-; CHECK-NEXT: cvt.u64.u16 %rd59, %rs10;
-; CHECK-NEXT: shl.b64 %rd60, %rd59, 56;
-; CHECK-NEXT: or.b64 %rd61, %rd58, %rd60;
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd31;
-; CHECK-NEXT: st.param.b8 [func_retval0+8], %rs2;
+; CHECK-NEXT: shl.b64 %rd17, %rd13, 8;
+; CHECK-NEXT: or.b64 %rd18, %rd17, %rd14;
+; CHECK-NEXT: shl.b64 %rd20, %rd12, 16;
+; CHECK-NEXT: shl.b64 %rd22, %rd11, 24;
+; CHECK-NEXT: or.b64 %rd23, %rd22, %rd20;
+; CHECK-NEXT: or.b64 %rd24, %rd23, %rd18;
+; CHECK-NEXT: shl.b64 %rd27, %rd9, 8;
+; CHECK-NEXT: or.b64 %rd28, %rd27, %rd10;
+; CHECK-NEXT: shl.b64 %rd30, %rd8, 16;
+; CHECK-NEXT: shl.b64 %rd32, %rd7, 24;
+; CHECK-NEXT: or.b64 %rd33, %rd32, %rd30;
+; CHECK-NEXT: or.b64 %rd34, %rd33, %rd28;
+; CHECK-NEXT: shl.b64 %rd35, %rd34, 32;
+; CHECK-NEXT: or.b64 %rd36, %rd35, %rd24;
+; CHECK-NEXT: st.param.b8 [func_retval0+9], %rd14;
+; CHECK-NEXT: st.param.b64 [func_retval0+24], %rd5;
+; CHECK-NEXT: st.param.b8 [func_retval0+8], %rs1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd6;
+; CHECK-NEXT: shr.u64 %rd39, %rd36, 56;
+; CHECK-NEXT: st.param.b8 [func_retval0+16], %rd39;
+; CHECK-NEXT: shr.u64 %rd40, %rd36, 48;
+; CHECK-NEXT: st.param.b8 [func_retval0+15], %rd40;
+; CHECK-NEXT: shr.u64 %rd41, %rd36, 40;
+; CHECK-NEXT: st.param.b8 [func_retval0+14], %rd41;
+; CHECK-NEXT: shr.u64 %rd42, %rd36, 32;
+; CHECK-NEXT: st.param.b8 [func_retval0+13], %rd42;
+; CHECK-NEXT: shr.u64 %rd43, %rd36, 24;
; CHECK-NEXT: st.param.b8 [func_retval0+12], %rd43;
-; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd39;
-; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd35;
-; CHECK-NEXT: st.param.b8 [func_retval0+9], %rd33;
-; CHECK-NEXT: shr.u64 %rd64, %rd50, 32;
-; CHECK-NEXT: st.param.b8 [func_retval0+13], %rd64;
-; CHECK-NEXT: shr.u64 %rd65, %rd54, 40;
-; CHECK-NEXT: st.param.b8 [func_retval0+14], %rd65;
-; CHECK-NEXT: shr.u64 %rd66, %rd58, 48;
-; CHECK-NEXT: st.param.b8 [func_retval0+15], %rd66;
-; CHECK-NEXT: shr.u64 %rd67, %rd61, 56;
-; CHECK-NEXT: st.param.b8 [func_retval0+16], %rd67;
-; CHECK-NEXT: st.param.b64 [func_retval0+24], %rd32;
+; CHECK-NEXT: shr.u64 %rd44, %rd36, 16;
+; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd44;
+; CHECK-NEXT: shr.u64 %rd45, %rd36, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd45;
; CHECK-NEXT: ret;
%r = tail call %s_i8f64p @test_s_i8f64p(%s_i8f64p %a)
ret %s_i8f64p %r
diff --git a/llvm/test/CodeGen/NVPTX/vaargs.ll b/llvm/test/CodeGen/NVPTX/vaargs.ll
index 3ca729f..9e312a2 100644
--- a/llvm/test/CodeGen/NVPTX/vaargs.ll
+++ b/llvm/test/CodeGen/NVPTX/vaargs.ll
@@ -89,14 +89,14 @@ define i32 @test_foo(i32 %i, i64 %l, double %d, ptr %p) {
; CHECK-NEXT: ld.param.b32 [[ARG_I32:%r[0-9]+]], [test_foo_param_0];
; Store arguments to an array
-; CHECK32: .param .align 8 .b8 param1[28];
-; CHECK64: .param .align 8 .b8 param1[32];
-; CHECK-NEXT: st.param.b32 [param1], [[ARG_I32]];
-; CHECK-NEXT: st.param.b64 [param1+8], [[ARG_I64]];
-; CHECK-NEXT: st.param.b64 [param1+16], [[ARG_DOUBLE]];
-; CHECK-NEXT: st.param.b[[BITS]] [param1+24], [[ARG_VOID_PTR]];
-; CHECK-NEXT: .param .b32 retval0;
-; CHECK-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .align 8 .b8 _[]
+; CHECK32: .param .align 8 .b8 param1[28];
+; CHECK64: .param .align 8 .b8 param1[32];
+; CHECK-DAG: .param .b32 retval0;
+; CHECK-DAG: st.param.b32 [param1], [[ARG_I32]];
+; CHECK-DAG: st.param.b64 [param1+8], [[ARG_I64]];
+; CHECK-DAG: st.param.b64 [param1+16], [[ARG_DOUBLE]];
+; CHECK-DAG: st.param.b[[BITS]] [param1+24], [[ARG_VOID_PTR]];
+; CHECK-DAG: prototype_1 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .align 8 .b8 _[]
entry:
%ptr = load ptr, ptr addrspacecast (ptr addrspace(1) @foo_ptr to ptr), align 8
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index ad2e704..a9b3675 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -115,13 +115,13 @@ define dso_local i32 @foo() {
; CHECK-PTX-NEXT: st.b64 [%SP+16], 1;
; CHECK-PTX-NEXT: st.b64 [%SP+24], 4607182418800017408;
; CHECK-PTX-NEXT: st.b64 [%SP+32], 4607182418800017408;
-; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 0;
; CHECK-PTX-NEXT: { // callseq 0, 0
; CHECK-PTX-NEXT: .param .b32 param0;
-; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
; CHECK-PTX-NEXT: .param .b64 param1;
-; CHECK-PTX-NEXT: st.param.b64 [param1], %rd1;
; CHECK-PTX-NEXT: .param .b32 retval0;
+; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 0;
+; CHECK-PTX-NEXT: st.param.b64 [param1], %rd1;
+; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
; CHECK-PTX-NEXT: call.uni (retval0), variadics1, (param0, param1);
; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0];
; CHECK-PTX-NEXT: } // callseq 0
@@ -218,13 +218,13 @@ define dso_local i32 @bar() {
; CHECK-PTX-NEXT: st.b32 [%SP+8], 1;
; CHECK-PTX-NEXT: st.b8 [%SP+12], 1;
; CHECK-PTX-NEXT: st.b64 [%SP+16], 1;
-; CHECK-PTX-NEXT: add.u64 %rd3, %SP, 8;
; CHECK-PTX-NEXT: { // callseq 1, 0
; CHECK-PTX-NEXT: .param .b32 param0;
-; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
; CHECK-PTX-NEXT: .param .b64 param1;
-; CHECK-PTX-NEXT: st.param.b64 [param1], %rd3;
; CHECK-PTX-NEXT: .param .b32 retval0;
+; CHECK-PTX-NEXT: add.u64 %rd3, %SP, 8;
+; CHECK-PTX-NEXT: st.param.b64 [param1], %rd3;
+; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
; CHECK-PTX-NEXT: call.uni (retval0), variadics2, (param0, param1);
; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0];
; CHECK-PTX-NEXT: } // callseq 1
@@ -289,13 +289,13 @@ define dso_local i32 @baz() {
; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot5;
; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-PTX-NEXT: st.v4.b32 [%SP], {1, 1, 1, 1};
-; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 0;
; CHECK-PTX-NEXT: { // callseq 2, 0
; CHECK-PTX-NEXT: .param .b32 param0;
-; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
; CHECK-PTX-NEXT: .param .b64 param1;
-; CHECK-PTX-NEXT: st.param.b64 [param1], %rd1;
; CHECK-PTX-NEXT: .param .b32 retval0;
+; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 0;
+; CHECK-PTX-NEXT: st.param.b64 [param1], %rd1;
+; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
; CHECK-PTX-NEXT: call.uni (retval0), variadics3, (param0, param1);
; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0];
; CHECK-PTX-NEXT: } // callseq 2
@@ -348,7 +348,6 @@ define dso_local void @qux() {
; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot7[24];
; CHECK-PTX-NEXT: .reg .b64 %SP;
; CHECK-PTX-NEXT: .reg .b64 %SPL;
-; CHECK-PTX-NEXT: .reg .b32 %r<2>;
; CHECK-PTX-NEXT: .reg .b64 %rd<8>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
@@ -360,18 +359,17 @@ define dso_local void @qux() {
; CHECK-PTX-NEXT: ld.global.nc.b64 %rd4, [__const_$_qux_$_s];
; CHECK-PTX-NEXT: st.local.b64 [%rd2], %rd4;
; CHECK-PTX-NEXT: st.b64 [%SP+16], 1;
-; CHECK-PTX-NEXT: ld.local.b64 %rd5, [%rd2];
-; CHECK-PTX-NEXT: ld.local.b64 %rd6, [%rd2+8];
-; CHECK-PTX-NEXT: add.u64 %rd7, %SP, 16;
; CHECK-PTX-NEXT: { // callseq 3, 0
; CHECK-PTX-NEXT: .param .align 8 .b8 param0[16];
-; CHECK-PTX-NEXT: st.param.b64 [param0], %rd5;
-; CHECK-PTX-NEXT: st.param.b64 [param0+8], %rd6;
; CHECK-PTX-NEXT: .param .b64 param1;
-; CHECK-PTX-NEXT: st.param.b64 [param1], %rd7;
; CHECK-PTX-NEXT: .param .b32 retval0;
+; CHECK-PTX-NEXT: add.u64 %rd5, %SP, 16;
+; CHECK-PTX-NEXT: st.param.b64 [param1], %rd5;
+; CHECK-PTX-NEXT: ld.local.b64 %rd6, [%rd2+8];
+; CHECK-PTX-NEXT: st.param.b64 [param0+8], %rd6;
+; CHECK-PTX-NEXT: ld.local.b64 %rd7, [%rd2];
+; CHECK-PTX-NEXT: st.param.b64 [param0], %rd7;
; CHECK-PTX-NEXT: call.uni (retval0), variadics4, (param0, param1);
-; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0];
; CHECK-PTX-NEXT: } // callseq 3
; CHECK-PTX-NEXT: ret;
entry:
diff --git a/llvm/test/CodeGen/PowerPC/froundeven-legalization.ll b/llvm/test/CodeGen/PowerPC/froundeven-legalization.ll
new file mode 100644
index 0000000..238e200
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/froundeven-legalization.ll
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=powerpc64le < %s | FileCheck %s
+
+define void @test(ptr %p1, ptr %p2) nounwind {
+; CHECK-LABEL: test:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mflr 0
+; CHECK-NEXT: stdu 1, -224(1)
+; CHECK-NEXT: li 5, 48
+; CHECK-NEXT: std 0, 240(1)
+; CHECK-NEXT: std 27, 184(1) # 8-byte Folded Spill
+; CHECK-NEXT: li 27, 16
+; CHECK-NEXT: std 28, 192(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 29, 200(1) # 8-byte Folded Spill
+; CHECK-NEXT: li 29, 32
+; CHECK-NEXT: li 28, 48
+; CHECK-NEXT: stxvd2x 56, 1, 5 # 16-byte Folded Spill
+; CHECK-NEXT: li 5, 64
+; CHECK-NEXT: std 30, 208(1) # 8-byte Folded Spill
+; CHECK-NEXT: mr 30, 4
+; CHECK-NEXT: stxvd2x 57, 1, 5 # 16-byte Folded Spill
+; CHECK-NEXT: li 5, 80
+; CHECK-NEXT: stxvd2x 58, 1, 5 # 16-byte Folded Spill
+; CHECK-NEXT: li 5, 96
+; CHECK-NEXT: lxvd2x 58, 0, 3
+; CHECK-NEXT: stxvd2x 59, 1, 5 # 16-byte Folded Spill
+; CHECK-NEXT: li 5, 112
+; CHECK-NEXT: lxvd2x 59, 3, 27
+; CHECK-NEXT: stxvd2x 60, 1, 5 # 16-byte Folded Spill
+; CHECK-NEXT: li 5, 128
+; CHECK-NEXT: stxvd2x 61, 1, 5 # 16-byte Folded Spill
+; CHECK-NEXT: li 5, 144
+; CHECK-NEXT: stxvd2x 62, 1, 5 # 16-byte Folded Spill
+; CHECK-NEXT: li 5, 160
+; CHECK-NEXT: lxvd2x 62, 3, 28
+; CHECK-NEXT: stxvd2x 63, 1, 5 # 16-byte Folded Spill
+; CHECK-NEXT: lxvd2x 63, 3, 29
+; CHECK-NEXT: xxswapd 57, 58
+; CHECK-NEXT: xxswapd 1, 59
+; CHECK-NEXT: xxswapd 60, 62
+; CHECK-NEXT: xxswapd 61, 63
+; CHECK-NEXT: bl roundeven
+; CHECK-NEXT: nop
+; CHECK-NEXT: xxswapd 56, 1
+; CHECK-NEXT: xxlor 1, 59, 59
+; CHECK-NEXT: bl roundeven
+; CHECK-NEXT: nop
+; CHECK-NEXT: xxswapd 0, 1
+; CHECK-NEXT: xxlor 1, 60, 60
+; CHECK-NEXT: xxmrgld 59, 0, 56
+; CHECK-NEXT: bl roundeven
+; CHECK-NEXT: nop
+; CHECK-NEXT: xxswapd 60, 1
+; CHECK-NEXT: xxlor 1, 62, 62
+; CHECK-NEXT: bl roundeven
+; CHECK-NEXT: nop
+; CHECK-NEXT: xxswapd 0, 1
+; CHECK-NEXT: xxlor 1, 61, 61
+; CHECK-NEXT: xxmrgld 62, 0, 60
+; CHECK-NEXT: bl roundeven
+; CHECK-NEXT: nop
+; CHECK-NEXT: xxswapd 61, 1
+; CHECK-NEXT: xxlor 1, 63, 63
+; CHECK-NEXT: bl roundeven
+; CHECK-NEXT: nop
+; CHECK-NEXT: xxswapd 0, 1
+; CHECK-NEXT: xxlor 1, 57, 57
+; CHECK-NEXT: xxmrgld 63, 0, 61
+; CHECK-NEXT: bl roundeven
+; CHECK-NEXT: nop
+; CHECK-NEXT: xxswapd 61, 1
+; CHECK-NEXT: xxlor 1, 58, 58
+; CHECK-NEXT: bl roundeven
+; CHECK-NEXT: nop
+; CHECK-NEXT: li 3, 160
+; CHECK-NEXT: stxvd2x 63, 30, 29
+; CHECK-NEXT: xxswapd 0, 1
+; CHECK-NEXT: stxvd2x 62, 30, 28
+; CHECK-NEXT: stxvd2x 59, 30, 27
+; CHECK-NEXT: ld 29, 200(1) # 8-byte Folded Reload
+; CHECK-NEXT: ld 28, 192(1) # 8-byte Folded Reload
+; CHECK-NEXT: ld 27, 184(1) # 8-byte Folded Reload
+; CHECK-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
+; CHECK-NEXT: li 3, 144
+; CHECK-NEXT: xxmrgld 0, 0, 61
+; CHECK-NEXT: lxvd2x 62, 1, 3 # 16-byte Folded Reload
+; CHECK-NEXT: li 3, 128
+; CHECK-NEXT: stxvd2x 0, 0, 30
+; CHECK-NEXT: ld 30, 208(1) # 8-byte Folded Reload
+; CHECK-NEXT: lxvd2x 61, 1, 3 # 16-byte Folded Reload
+; CHECK-NEXT: li 3, 112
+; CHECK-NEXT: lxvd2x 60, 1, 3 # 16-byte Folded Reload
+; CHECK-NEXT: li 3, 96
+; CHECK-NEXT: lxvd2x 59, 1, 3 # 16-byte Folded Reload
+; CHECK-NEXT: li 3, 80
+; CHECK-NEXT: lxvd2x 58, 1, 3 # 16-byte Folded Reload
+; CHECK-NEXT: li 3, 64
+; CHECK-NEXT: lxvd2x 57, 1, 3 # 16-byte Folded Reload
+; CHECK-NEXT: li 3, 48
+; CHECK-NEXT: lxvd2x 56, 1, 3 # 16-byte Folded Reload
+; CHECK-NEXT: addi 1, 1, 224
+; CHECK-NEXT: ld 0, 16(1)
+; CHECK-NEXT: mtlr 0
+; CHECK-NEXT: blr
+ %v = load <8 x double>, ptr %p1, align 64
+ %res = call <8 x double> @llvm.roundeven.v8f64(<8 x double> %v)
+ store <8 x double> %res, ptr %p2, align 64
+ ret void
+}
+
+declare <8 x double> @llvm.roundeven.v8f64(<8 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
index 807651c..dc80225 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
@@ -261,7 +261,7 @@ define { <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3(ptr %p
; CHECK-LABEL: vector_deinterleave_load_factor3:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg3e8.v v6, (a0)
+; CHECK-NEXT: vlseg3e8.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <24 x i8>, ptr %p
%d0 = call {<8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave3(<24 x i8> %vec)
@@ -269,8 +269,8 @@ define { <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3(ptr %p
%t1 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 1
%t2 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 2
%res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0
- %res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 0
- %res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 0
+ %res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 1
+ %res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 2
ret { <8 x i8>, <8 x i8>, <8 x i8> } %res2
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-segN-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-segN-load.ll
index 4eed3df..8c3ebb9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-segN-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-segN-load.ll
@@ -1,107 +1,72 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple riscv64 -mattr=+zve64x,+zvl128b < %s | FileCheck %s
-define <8 x i8> @load_factor2(ptr %ptr) {
+define {<8 x i8>, <8 x i8>} @load_factor2(ptr %ptr) {
; CHECK-LABEL: load_factor2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0)
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
%1 = call { <8 x i8>, <8 x i8> } @llvm.riscv.seg2.load.mask.v8i8.i64(ptr %ptr, <8 x i1> splat (i1 true), i64 8)
- %2 = extractvalue { <8 x i8>, <8 x i8> } %1, 0
- %3 = extractvalue { <8 x i8>, <8 x i8> } %1, 1
- ret <8 x i8> %3
+ ret {<8 x i8>, <8 x i8>} %1
}
-define <8 x i8> @load_factor3(ptr %ptr) {
+define {<8 x i8>, <8 x i8>, <8 x i8>} @load_factor3(ptr %ptr) {
; CHECK-LABEL: load_factor3:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg3e8.v v6, (a0)
+; CHECK-NEXT: vlseg3e8.v v8, (a0)
; CHECK-NEXT: ret
%1 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.riscv.seg3.load.mask.v8i8.i64(ptr %ptr, <8 x i1> splat (i1 true), i64 8)
- %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
- %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 1
- %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 2
- ret <8 x i8> %4
+ ret { <8 x i8>, <8 x i8>, <8 x i8> } %1
}
-define <8 x i8> @load_factor4(ptr %ptr) {
+define {<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>} @load_factor4(ptr %ptr) {
; CHECK-LABEL: load_factor4:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg4e8.v v5, (a0)
+; CHECK-NEXT: vlseg4e8.v v8, (a0)
; CHECK-NEXT: ret
%1 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.riscv.seg4.load.mask.v8i8.i64(ptr %ptr, <8 x i1> splat (i1 true), i64 8)
- %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
- %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 1
- %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 2
- %5 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 3
- ret <8 x i8> %5
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1
}
-define <8 x i8> @load_factor5(ptr %ptr) {
+define {<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>} @load_factor5(ptr %ptr) {
; CHECK-LABEL: load_factor5:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg5e8.v v4, (a0)
+; CHECK-NEXT: vlseg5e8.v v8, (a0)
; CHECK-NEXT: ret
%1 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.riscv.seg5.load.mask.v8i8.i64(ptr %ptr, <8 x i1> splat (i1 true), i64 8)
- %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
- %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 1
- %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 2
- %5 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 3
- %6 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 4
- ret <8 x i8> %6
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1
}
-define <8 x i8> @load_factor6(ptr %ptr) {
+define {<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>} @load_factor6(ptr %ptr) {
; CHECK-LABEL: load_factor6:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg6e8.v v3, (a0)
+; CHECK-NEXT: vlseg6e8.v v8, (a0)
; CHECK-NEXT: ret
%1 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.riscv.seg6.load.mask.v8i8.i64(ptr %ptr, <8 x i1> splat (i1 true), i64 8)
- %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
- %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 1
- %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 2
- %5 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 3
- %6 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 4
- %7 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 5
- ret <8 x i8> %7
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1
}
-define <8 x i8> @load_factor7(ptr %ptr) {
+define {<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>} @load_factor7(ptr %ptr) {
; CHECK-LABEL: load_factor7:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg7e8.v v2, (a0)
+; CHECK-NEXT: vlseg7e8.v v8, (a0)
; CHECK-NEXT: ret
%1 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.riscv.seg7.load.mask.v8i8.i64(ptr %ptr, <8 x i1> splat (i1 true), i64 8)
- %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
- %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 1
- %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 2
- %5 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 3
- %6 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 4
- %7 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 5
- %8 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 6
- ret <8 x i8> %8
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1
}
-define <8 x i8> @load_factor8(ptr %ptr) {
+define {<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>} @load_factor8(ptr %ptr) {
; CHECK-LABEL: load_factor8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg8e8.v v1, (a0)
+; CHECK-NEXT: vlseg8e8.v v8, (a0)
; CHECK-NEXT: ret
%1 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.riscv.seg8.load.mask.v8i8.i64(ptr %ptr, <8 x i1> splat (i1 true), i64 8)
- %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
- %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 1
- %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 2
- %5 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 3
- %6 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 4
- %7 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 5
- %8 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 6
- %9 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 7
- ret <8 x i8> %9
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
index f9f0aa6..fba592d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -372,7 +372,7 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @vector_deint
; CHECK-LABEL: vector_deinterleave_load_factor3:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT: vlseg3e8.v v6, (a0)
+; CHECK-NEXT: vlseg3e8.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <vscale x 24 x i8>, ptr %p
%d0 = call {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave3(<vscale x 24 x i8> %vec)
@@ -380,8 +380,8 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @vector_deint
%t1 = extractvalue {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} %d0, 1
%t2 = extractvalue {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} %d0, 2
%res0 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } poison, <vscale x 8 x i8> %t0, 0
- %res1 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res0, <vscale x 8 x i8> %t1, 0
- %res2 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res1, <vscale x 8 x i8> %t2, 0
+ %res1 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res0, <vscale x 8 x i8> %t1, 1
+ %res2 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res1, <vscale x 8 x i8> %t2, 2
ret { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res2
}
@@ -634,3 +634,19 @@ define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i
%deinterleaved.results = call {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave4.nxv32i8(<vscale x 32 x i8> %vec)
ret {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} %deinterleaved.results
}
+
+define { <8 x float>, <8 x float> } @deinterleave_unrelated(<16 x float> %arg) {
+; CHECK-LABEL: deinterleave_unrelated:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vfabs.v v12, v8
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vnsrl.wx v10, v12, a0
+; CHECK-NEXT: vnsrl.wi v8, v12, 0
+; CHECK-NEXT: ret
+entry:
+ %abs = call <16 x float> @llvm.fabs(<16 x float> %arg)
+ %res = call { <8 x float>, <8 x float> } @llvm.vector.deinterleave2.v16f32(<16 x float> %abs)
+ ret { <8 x float>, <8 x float> } %res
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index 0a96e4f..5b1746d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -3744,3 +3744,61 @@ define <vscale x 1 x float> @vector_deinterleave_nxv1f32_nxv8f32_oneactive2(<vsc
%ext = extractvalue {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} %res, 5
ret <vscale x 1 x float> %ext
}
+
+
+define { <8 x float>, <8 x float> } @interleave_deinterleave2(<8 x float> %a, <8 x float> %b) {
+; V-LABEL: interleave_deinterleave2:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; V-NEXT: vwaddu.vv v12, v8, v10
+; V-NEXT: li a0, -1
+; V-NEXT: vwmaccu.vx v12, a0, v10
+; V-NEXT: li a0, 32
+; V-NEXT: vnsrl.wx v10, v12, a0
+; V-NEXT: vnsrl.wi v8, v12, 0
+; V-NEXT: ret
+;
+; ZIP-LABEL: interleave_deinterleave2:
+; ZIP: # %bb.0: # %entry
+; ZIP-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; ZIP-NEXT: vmv2r.v v12, v10
+; ZIP-NEXT: li a0, 32
+; ZIP-NEXT: ri.vzip2a.vv v16, v8, v12
+; ZIP-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; ZIP-NEXT: vnsrl.wx v10, v16, a0
+; ZIP-NEXT: vnsrl.wi v8, v16, 0
+; ZIP-NEXT: ret
+entry:
+ %0 = call <16 x float> @llvm.vector.interleave2.v16f32(<8 x float> %a, <8 x float> %b)
+ %1 = call { <8 x float>, <8 x float> } @llvm.vector.deinterleave2.v16f32(<16 x float> %0)
+ ret { <8 x float>, <8 x float> } %1
+}
+
+define <16 x float> @deinterleave_interleave2(<16 x float> %arg) {
+; V-LABEL: deinterleave_interleave2:
+; V: # %bb.0: # %entry
+; V-NEXT: li a0, 32
+; V-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; V-NEXT: vnsrl.wi v12, v8, 0
+; V-NEXT: vnsrl.wx v14, v8, a0
+; V-NEXT: vwaddu.vv v8, v12, v14
+; V-NEXT: li a0, -1
+; V-NEXT: vwmaccu.vx v8, a0, v14
+; V-NEXT: ret
+;
+; ZIP-LABEL: deinterleave_interleave2:
+; ZIP: # %bb.0: # %entry
+; ZIP-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; ZIP-NEXT: vnsrl.wi v12, v8, 0
+; ZIP-NEXT: li a0, 32
+; ZIP-NEXT: vnsrl.wx v16, v8, a0
+; ZIP-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; ZIP-NEXT: ri.vzip2a.vv v8, v12, v16
+; ZIP-NEXT: ret
+entry:
+ %0 = call { <8 x float>, <8 x float> } @llvm.vector.deinterleave2.v16f32(<16 x float> %arg)
+ %a = extractvalue { <8 x float>, <8 x float> } %0, 0
+ %b = extractvalue { <8 x float>, <8 x float> } %0, 1
+ %res = call <16 x float> @llvm.vector.interleave2.v16f32(<8 x float> %a, <8 x float> %b)
+ ret <16 x float> %res
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vlseg-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vlseg-rv32.ll
index e6a98c9..eb3422d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vlseg-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vlseg-rv32.ll
@@ -2,4246 +2,3303 @@
; RUN: llc -mtriple=riscv32 -mattr=+zve64d,+f,+d,+zvfh,+zvfbfmin \
; RUN: -verify-machineinstrs < %s | FileCheck %s
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", <vscale x 1 x i8>, 2), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 2), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i8> @test_vlseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @test_vlseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0)
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", <vscale x 1 x i8>, 2) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", <vscale x 1 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 2) %0
}
-
-define <vscale x 1 x i8> @test_vlseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @test_vlseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", <vscale x 1 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 2), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i8> @test_vlseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0)
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
}
-
-define <vscale x 2 x i8> @test_vlseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2), ptr, <vscale x 4 x i1>, i32, i32, i32)
-
-define <vscale x 4 x i8> @test_vlseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0)
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-define <vscale x 4 x i8> @test_vlseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2), ptr, <vscale x 8 x i1>, i32, i32, i32)
-
-define <vscale x 8 x i8> @test_vlseg2_nxv8i8_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv8i8_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv8i8_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0)
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-define <vscale x 8 x i8> @test_vlseg2_mask_nxv8i8_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv8i8_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv8i8_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2), ptr, <vscale x 16 x i1>, i32, i32, i32)
-
-define <vscale x 16 x i8> @test_vlseg2_nxv16i8_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv16i8_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv16i8_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vlseg2e8.v v6, (a0)
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 16 x i8> @llvm.riscv.tuple.extract.nxv16i8.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 16 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-define <vscale x 16 x i8> @test_vlseg2_mask_nxv16i8_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 16 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv16i8_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 16 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv16i8_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vlseg2e8.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg2e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 16 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 16 x i8> @llvm.riscv.tuple.extract.nxv16i8.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 16 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv32i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2), ptr, <vscale x 32 x i1>, i32, i32, i32)
-
-define <vscale x 32 x i8> @test_vlseg2_nxv32i8_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv32i8_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv32i8_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT: vlseg2e8.v v4, (a0)
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 32 x i8> @llvm.riscv.tuple.extract.nxv32i8.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 32 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-define <vscale x 32 x i8> @test_vlseg2_mask_nxv32i8_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 32 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv32i8_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 32 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv32i8_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT: vlseg2e8.v v4, (a0), v0.t
+; CHECK-NEXT: vlseg2e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv32i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 32 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 32 x i8> @llvm.riscv.tuple.extract.nxv32i8.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 32 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv1i8_3t(target("riscv.vector.tuple", <vscale x 1 x i8>, 3), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 3), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i8> @test_vlseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @test_vlseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg3e8.v v7, (a0)
+; CHECK-NEXT: vlseg3e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv1i8_3t(target("riscv.vector.tuple", <vscale x 1 x i8>, 3) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_3t(target("riscv.vector.tuple", <vscale x 1 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 3) %0
}
-
-define <vscale x 1 x i8> @test_vlseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @test_vlseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg3e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_3t(target("riscv.vector.tuple", <vscale x 1 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i8> @test_vlseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg3e8.v v7, (a0)
+; CHECK-NEXT: vlseg3e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0
}
-
-define <vscale x 2 x i8> @test_vlseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg3e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3), ptr, <vscale x 4 x i1>, i32, i32, i32)
-
-define <vscale x 4 x i8> @test_vlseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg3e8.v v7, (a0)
+; CHECK-NEXT: vlseg3e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-define <vscale x 4 x i8> @test_vlseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg3e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3), ptr, <vscale x 8 x i1>, i32, i32, i32)
-
-define <vscale x 8 x i8> @test_vlseg3_nxv8i8_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv8i8_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv8i8_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg3e8.v v7, (a0)
+; CHECK-NEXT: vlseg3e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-define <vscale x 8 x i8> @test_vlseg3_mask_nxv8i8_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv8i8_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv8i8_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg3e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv16i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3), ptr, <vscale x 16 x i1>, i32, i32, i32)
-
-define <vscale x 16 x i8> @test_vlseg3_nxv16i8_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv16i8_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv16i8_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vlseg3e8.v v6, (a0)
+; CHECK-NEXT: vlseg3e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 16 x i8> @llvm.riscv.tuple.extract.nxv16i8.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 16 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-define <vscale x 16 x i8> @test_vlseg3_mask_nxv16i8_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 16 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv16i8_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 16 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv16i8_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vlseg3e8.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg3e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv16i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 16 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 16 x i8> @llvm.riscv.tuple.extract.nxv16i8.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 16 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv1i8_4t(target("riscv.vector.tuple", <vscale x 1 x i8>, 4), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 4), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i8> @test_vlseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @test_vlseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg4e8.v v7, (a0)
+; CHECK-NEXT: vlseg4e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv1i8_4t(target("riscv.vector.tuple", <vscale x 1 x i8>, 4) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_4t(target("riscv.vector.tuple", <vscale x 1 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 4) %0
}
-
-define <vscale x 1 x i8> @test_vlseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @test_vlseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg4e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_4t(target("riscv.vector.tuple", <vscale x 1 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 4), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i8> @test_vlseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg4e8.v v7, (a0)
+; CHECK-NEXT: vlseg4e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0
}
-
-define <vscale x 2 x i8> @test_vlseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg4e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4), ptr, <vscale x 4 x i1>, i32, i32, i32)
-
-define <vscale x 4 x i8> @test_vlseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg4e8.v v7, (a0)
+; CHECK-NEXT: vlseg4e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-define <vscale x 4 x i8> @test_vlseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg4e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4), ptr, <vscale x 8 x i1>, i32, i32, i32)
-
-define <vscale x 8 x i8> @test_vlseg4_nxv8i8_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv8i8_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv8i8_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg4e8.v v7, (a0)
+; CHECK-NEXT: vlseg4e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-define <vscale x 8 x i8> @test_vlseg4_mask_nxv8i8_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv8i8_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv8i8_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg4e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv16i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4), ptr, <vscale x 16 x i1>, i32, i32, i32)
-
-define <vscale x 16 x i8> @test_vlseg4_nxv16i8_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv16i8_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv16i8_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vlseg4e8.v v6, (a0)
+; CHECK-NEXT: vlseg4e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 16 x i8> @llvm.riscv.tuple.extract.nxv16i8.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 16 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-define <vscale x 16 x i8> @test_vlseg4_mask_nxv16i8_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 16 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv16i8_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 16 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv16i8_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vlseg4e8.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv16i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 16 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 16 x i8> @llvm.riscv.tuple.extract.nxv16i8.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 16 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv1i8_5t(target("riscv.vector.tuple", <vscale x 1 x i8>, 5), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 5), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i8> @test_vlseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @test_vlseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg5e8.v v7, (a0)
+; CHECK-NEXT: vlseg5e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv1i8_5t(target("riscv.vector.tuple", <vscale x 1 x i8>, 5) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_5t(target("riscv.vector.tuple", <vscale x 1 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 5) %0
}
-
-define <vscale x 1 x i8> @test_vlseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @test_vlseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg5e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_5t(target("riscv.vector.tuple", <vscale x 1 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 5), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i8> @test_vlseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg5e8.v v7, (a0)
+; CHECK-NEXT: vlseg5e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0
}
-
-define <vscale x 2 x i8> @test_vlseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg5e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5), ptr, <vscale x 4 x i1>, i32, i32, i32)
-
-define <vscale x 4 x i8> @test_vlseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg5e8.v v7, (a0)
+; CHECK-NEXT: vlseg5e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-define <vscale x 4 x i8> @test_vlseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg5e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5), ptr, <vscale x 8 x i1>, i32, i32, i32)
-
-define <vscale x 8 x i8> @test_vlseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg5e8.v v7, (a0)
+; CHECK-NEXT: vlseg5e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-define <vscale x 8 x i8> @test_vlseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg5e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv1i8_6t(target("riscv.vector.tuple", <vscale x 1 x i8>, 6), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 6), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i8> @test_vlseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @test_vlseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg6e8.v v7, (a0)
+; CHECK-NEXT: vlseg6e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv1i8_6t(target("riscv.vector.tuple", <vscale x 1 x i8>, 6) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_6t(target("riscv.vector.tuple", <vscale x 1 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 6) %0
}
-
-define <vscale x 1 x i8> @test_vlseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @test_vlseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg6e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_6t(target("riscv.vector.tuple", <vscale x 1 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 6), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i8> @test_vlseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg6e8.v v7, (a0)
+; CHECK-NEXT: vlseg6e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0
}
-
-define <vscale x 2 x i8> @test_vlseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg6e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6), ptr, <vscale x 4 x i1>, i32, i32, i32)
-
-define <vscale x 4 x i8> @test_vlseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg6e8.v v7, (a0)
+; CHECK-NEXT: vlseg6e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-define <vscale x 4 x i8> @test_vlseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg6e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6), ptr, <vscale x 8 x i1>, i32, i32, i32)
-
-define <vscale x 8 x i8> @test_vlseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg6e8.v v7, (a0)
+; CHECK-NEXT: vlseg6e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-define <vscale x 8 x i8> @test_vlseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg6e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv1i8_7t(target("riscv.vector.tuple", <vscale x 1 x i8>, 7), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 7), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i8> @test_vlseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @test_vlseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0)
+; CHECK-NEXT: vlseg7e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv1i8_7t(target("riscv.vector.tuple", <vscale x 1 x i8>, 7) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_7t(target("riscv.vector.tuple", <vscale x 1 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 7) %0
}
-
-define <vscale x 1 x i8> @test_vlseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @test_vlseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_7t(target("riscv.vector.tuple", <vscale x 1 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 7), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i8> @test_vlseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0)
+; CHECK-NEXT: vlseg7e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0
}
-
-define <vscale x 2 x i8> @test_vlseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7), ptr, <vscale x 4 x i1>, i32, i32, i32)
-
-define <vscale x 4 x i8> @test_vlseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0)
+; CHECK-NEXT: vlseg7e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-define <vscale x 4 x i8> @test_vlseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7), ptr, <vscale x 8 x i1>, i32, i32, i32)
-
-define <vscale x 8 x i8> @test_vlseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0)
+; CHECK-NEXT: vlseg7e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-define <vscale x 8 x i8> @test_vlseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv1i8_8t(target("riscv.vector.tuple", <vscale x 1 x i8>, 8), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 8), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i8> @test_vlseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @test_vlseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0)
+; CHECK-NEXT: vlseg8e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv1i8_8t(target("riscv.vector.tuple", <vscale x 1 x i8>, 8) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_8t(target("riscv.vector.tuple", <vscale x 1 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 8) %0
}
-
-define <vscale x 1 x i8> @test_vlseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @test_vlseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_8t(target("riscv.vector.tuple", <vscale x 1 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 8), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i8> @test_vlseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0)
+; CHECK-NEXT: vlseg8e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0
}
-
-define <vscale x 2 x i8> @test_vlseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8), ptr, <vscale x 4 x i1>, i32, i32, i32)
-
-define <vscale x 4 x i8> @test_vlseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0)
+; CHECK-NEXT: vlseg8e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-define <vscale x 4 x i8> @test_vlseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8), ptr, <vscale x 8 x i1>, i32, i32, i32)
-
-define <vscale x 8 x i8> @test_vlseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0)
+; CHECK-NEXT: vlseg8e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i32 %vl, i32 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-define <vscale x 8 x i8> @test_vlseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 2), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i16> @test_vlseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
}
-
-define <vscale x 1 x i16> @test_vlseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i16> @test_vlseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-define <vscale x 2 x i16> @test_vlseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2), ptr, <vscale x 4 x i1>, i32, i32, i32)
-
-define <vscale x 4 x i16> @test_vlseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-define <vscale x 4 x i16> @test_vlseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2), ptr, <vscale x 8 x i1>, i32, i32, i32)
-
-define <vscale x 8 x i16> @test_vlseg2_nxv8i16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv8i16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv8i16_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v6, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 8 x i16> @llvm.riscv.tuple.extract.nxv8i16.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-define <vscale x 8 x i16> @test_vlseg2_mask_nxv8i16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv8i16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv8i16_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 8 x i16> @llvm.riscv.tuple.extract.nxv8i16.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2), ptr, <vscale x 16 x i1>, i32, i32, i32)
-
-define <vscale x 16 x i16> @test_vlseg2_nxv16i16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv16i16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv16i16_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v4, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 16 x i16> @llvm.riscv.tuple.extract.nxv16i16.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 16 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-define <vscale x 16 x i16> @test_vlseg2_mask_nxv16i16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 16 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv16i16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 16 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv16i16_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v4, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 16 x i16> @llvm.riscv.tuple.extract.nxv16i16.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 16 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i16> @test_vlseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0
}
-
-define <vscale x 1 x i16> @test_vlseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i16> @test_vlseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-define <vscale x 2 x i16> @test_vlseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3), ptr, <vscale x 4 x i1>, i32, i32, i32)
-
-define <vscale x 4 x i16> @test_vlseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-define <vscale x 4 x i16> @test_vlseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3), ptr, <vscale x 8 x i1>, i32, i32, i32)
-
-define <vscale x 8 x i16> @test_vlseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v6, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 8 x i16> @llvm.riscv.tuple.extract.nxv8i16.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 8 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-define <vscale x 8 x i16> @test_vlseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 8 x i16> @llvm.riscv.tuple.extract.nxv8i16.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 8 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 4), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i16> @test_vlseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0
}
-
-define <vscale x 1 x i16> @test_vlseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i16> @test_vlseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-define <vscale x 2 x i16> @test_vlseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4), ptr, <vscale x 4 x i1>, i32, i32, i32)
-
-define <vscale x 4 x i16> @test_vlseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-define <vscale x 4 x i16> @test_vlseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4), ptr, <vscale x 8 x i1>, i32, i32, i32)
-
-define <vscale x 8 x i16> @test_vlseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v6, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 8 x i16> @llvm.riscv.tuple.extract.nxv8i16.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 8 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-define <vscale x 8 x i16> @test_vlseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 8 x i16> @llvm.riscv.tuple.extract.nxv8i16.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 8 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 5), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i16> @test_vlseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0
}
-
-define <vscale x 1 x i16> @test_vlseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i16> @test_vlseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-define <vscale x 2 x i16> @test_vlseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5), ptr, <vscale x 4 x i1>, i32, i32, i32)
-
-define <vscale x 4 x i16> @test_vlseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-define <vscale x 4 x i16> @test_vlseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 6), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i16> @test_vlseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0
}
-
-define <vscale x 1 x i16> @test_vlseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i16> @test_vlseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-define <vscale x 2 x i16> @test_vlseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6), ptr, <vscale x 4 x i1>, i32, i32, i32)
-
-define <vscale x 4 x i16> @test_vlseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-define <vscale x 4 x i16> @test_vlseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 7), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i16> @test_vlseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0
}
-
-define <vscale x 1 x i16> @test_vlseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i16> @test_vlseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-define <vscale x 2 x i16> @test_vlseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7), ptr, <vscale x 4 x i1>, i32, i32, i32)
-
-define <vscale x 4 x i16> @test_vlseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-define <vscale x 4 x i16> @test_vlseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 8), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i16> @test_vlseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0)
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0
}
-
-define <vscale x 1 x i16> @test_vlseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i16> @test_vlseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0)
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-define <vscale x 2 x i16> @test_vlseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8), ptr, <vscale x 4 x i1>, i32, i32, i32)
-
-define <vscale x 4 x i16> @test_vlseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0)
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-define <vscale x 4 x i16> @test_vlseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i32> @test_vlseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg2e32.v v7, (a0)
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-define <vscale x 1 x i32> @test_vlseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg2e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i32> @test_vlseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg2e32.v v7, (a0)
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-define <vscale x 2 x i32> @test_vlseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg2e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2), ptr, <vscale x 4 x i1>, i32, i32, i32)
-
-define <vscale x 4 x i32> @test_vlseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg2e32.v v6, (a0)
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-define <vscale x 4 x i32> @test_vlseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg2e32.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2), ptr, <vscale x 8 x i1>, i32, i32, i32)
-
-define <vscale x 8 x i32> @test_vlseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vlseg2e32.v v4, (a0)
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 8 x i32> @llvm.riscv.tuple.extract.nxv8i32.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-define <vscale x 8 x i32> @test_vlseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vlseg2e32.v v4, (a0), v0.t
+; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 8 x i32> @llvm.riscv.tuple.extract.nxv8i32.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i32> @test_vlseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg3e32.v v7, (a0)
+; CHECK-NEXT: vlseg3e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-define <vscale x 1 x i32> @test_vlseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg3e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i32> @test_vlseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg3e32.v v7, (a0)
+; CHECK-NEXT: vlseg3e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-define <vscale x 2 x i32> @test_vlseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg3e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3), ptr, <vscale x 4 x i1>, i32, i32, i32)
-
-define <vscale x 4 x i32> @test_vlseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg3e32.v v6, (a0)
+; CHECK-NEXT: vlseg3e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-define <vscale x 4 x i32> @test_vlseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg3e32.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i32> @test_vlseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg4e32.v v7, (a0)
+; CHECK-NEXT: vlseg4e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-define <vscale x 1 x i32> @test_vlseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg4e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i32> @test_vlseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg4e32.v v7, (a0)
+; CHECK-NEXT: vlseg4e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-define <vscale x 2 x i32> @test_vlseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg4e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4), ptr, <vscale x 4 x i1>, i32, i32, i32)
-
-define <vscale x 4 x i32> @test_vlseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg4e32.v v6, (a0)
+; CHECK-NEXT: vlseg4e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-define <vscale x 4 x i32> @test_vlseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg4e32.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg4e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i32> @test_vlseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg5e32.v v7, (a0)
+; CHECK-NEXT: vlseg5e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-define <vscale x 1 x i32> @test_vlseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg5e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i32> @test_vlseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg5e32.v v7, (a0)
+; CHECK-NEXT: vlseg5e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-define <vscale x 2 x i32> @test_vlseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg5e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i32> @test_vlseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg6e32.v v7, (a0)
+; CHECK-NEXT: vlseg6e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-define <vscale x 1 x i32> @test_vlseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg6e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i32> @test_vlseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg6e32.v v7, (a0)
+; CHECK-NEXT: vlseg6e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-define <vscale x 2 x i32> @test_vlseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg6e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i32> @test_vlseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg7e32.v v7, (a0)
+; CHECK-NEXT: vlseg7e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-define <vscale x 1 x i32> @test_vlseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg7e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i32> @test_vlseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg7e32.v v7, (a0)
+; CHECK-NEXT: vlseg7e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-define <vscale x 2 x i32> @test_vlseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg7e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i32> @test_vlseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg8e32.v v7, (a0)
+; CHECK-NEXT: vlseg8e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-define <vscale x 1 x i32> @test_vlseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg8e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i32> @test_vlseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg8e32.v v7, (a0)
+; CHECK-NEXT: vlseg8e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-define <vscale x 2 x i32> @test_vlseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg8e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i64> @test_vlseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg2e64.v v7, (a0)
+; CHECK-NEXT: vlseg2e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-define <vscale x 1 x i64> @test_vlseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg2e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i64> @test_vlseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg2e64.v v6, (a0)
+; CHECK-NEXT: vlseg2e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 2 x i64> @llvm.riscv.tuple.extract.nxv2i64.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-define <vscale x 2 x i64> @test_vlseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg2e64.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg2e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 2 x i64> @llvm.riscv.tuple.extract.nxv2i64.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2), ptr, <vscale x 4 x i1>, i32, i32, i32)
-
-define <vscale x 4 x i64> @test_vlseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT: vlseg2e64.v v4, (a0)
+; CHECK-NEXT: vlseg2e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 4 x i64> @llvm.riscv.tuple.extract.nxv4i64.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-define <vscale x 4 x i64> @test_vlseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT: vlseg2e64.v v4, (a0), v0.t
+; CHECK-NEXT: vlseg2e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 4 x i64> @llvm.riscv.tuple.extract.nxv4i64.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i64> @test_vlseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg3e64.v v7, (a0)
+; CHECK-NEXT: vlseg3e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-define <vscale x 1 x i64> @test_vlseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg3e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i64> @test_vlseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg3e64.v v6, (a0)
+; CHECK-NEXT: vlseg3e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 2 x i64> @llvm.riscv.tuple.extract.nxv2i64.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-define <vscale x 2 x i64> @test_vlseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg3e64.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg3e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 2 x i64> @llvm.riscv.tuple.extract.nxv2i64.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i64> @test_vlseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg4e64.v v7, (a0)
+; CHECK-NEXT: vlseg4e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-define <vscale x 1 x i64> @test_vlseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg4e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i64> @test_vlseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg4e64.v v6, (a0)
+; CHECK-NEXT: vlseg4e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 2 x i64> @llvm.riscv.tuple.extract.nxv2i64.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-define <vscale x 2 x i64> @test_vlseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg4e64.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg4e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 2 x i64> @llvm.riscv.tuple.extract.nxv2i64.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i64> @test_vlseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg5e64.v v7, (a0)
+; CHECK-NEXT: vlseg5e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-define <vscale x 1 x i64> @test_vlseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg5e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i64> @test_vlseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg6e64.v v7, (a0)
+; CHECK-NEXT: vlseg6e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-define <vscale x 1 x i64> @test_vlseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg6e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i64> @test_vlseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg7e64.v v7, (a0)
+; CHECK-NEXT: vlseg7e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-define <vscale x 1 x i64> @test_vlseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg7e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i64> @test_vlseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg8e64.v v7, (a0)
+; CHECK-NEXT: vlseg8e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-define <vscale x 1 x i64> @test_vlseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg8e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-
-define <vscale x 1 x half> @test_vlseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
}
-
-define <vscale x 1 x half> @test_vlseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-define <vscale x 2 x half> @test_vlseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-define <vscale x 4 x half> @test_vlseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-
-define <vscale x 8 x half> @test_vlseg2_nxv8f16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv8f16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv8f16_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v6, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 8 x half> @llvm.riscv.tuple.extract.nxv8f16.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-define <vscale x 8 x half> @test_vlseg2_mask_nxv8f16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv8f16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv8f16_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 8 x half> @llvm.riscv.tuple.extract.nxv8f16.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-
-define <vscale x 16 x half> @test_vlseg2_nxv16f16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv16f16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv16f16_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v4, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 16 x half> @llvm.riscv.tuple.extract.nxv16f16.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 16 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-define <vscale x 16 x half> @test_vlseg2_mask_nxv16f16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 16 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv16f16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 16 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv16f16_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v4, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 16 x half> @llvm.riscv.tuple.extract.nxv16f16.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 16 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-
-define <vscale x 1 x half> @test_vlseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0
}
-
-define <vscale x 1 x half> @test_vlseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-define <vscale x 2 x half> @test_vlseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-define <vscale x 4 x half> @test_vlseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-
-define <vscale x 8 x half> @test_vlseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v6, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 8 x half> @llvm.riscv.tuple.extract.nxv8f16.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 8 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-define <vscale x 8 x half> @test_vlseg3_mask_nxv8f16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv8f16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv8f16_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 8 x half> @llvm.riscv.tuple.extract.nxv8f16.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 8 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-
-define <vscale x 1 x half> @test_vlseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0
}
-
-define <vscale x 1 x half> @test_vlseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-define <vscale x 2 x half> @test_vlseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-define <vscale x 4 x half> @test_vlseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-
-define <vscale x 8 x half> @test_vlseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v6, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 8 x half> @llvm.riscv.tuple.extract.nxv8f16.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 8 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-define <vscale x 8 x half> @test_vlseg4_mask_nxv8f16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv8f16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv8f16_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 8 x half> @llvm.riscv.tuple.extract.nxv8f16.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 8 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-
-define <vscale x 1 x half> @test_vlseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0
}
-
-define <vscale x 1 x half> @test_vlseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-define <vscale x 2 x half> @test_vlseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-define <vscale x 4 x half> @test_vlseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-
-define <vscale x 1 x half> @test_vlseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0
}
-
-define <vscale x 1 x half> @test_vlseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-define <vscale x 2 x half> @test_vlseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-define <vscale x 4 x half> @test_vlseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-
-define <vscale x 1 x half> @test_vlseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0
}
-
-define <vscale x 1 x half> @test_vlseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-define <vscale x 2 x half> @test_vlseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-define <vscale x 4 x half> @test_vlseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-
-define <vscale x 1 x half> @test_vlseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0)
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0
}
-
-define <vscale x 1 x half> @test_vlseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0)
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-define <vscale x 2 x half> @test_vlseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0)
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-define <vscale x 4 x half> @test_vlseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-
-define <vscale x 1 x float> @test_vlseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg2e32.v v7, (a0)
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-define <vscale x 1 x float> @test_vlseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg2e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-
-define <vscale x 2 x float> @test_vlseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg2e32.v v7, (a0)
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-define <vscale x 2 x float> @test_vlseg2_mask_nxv2f32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv2f32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv2f32_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg2e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-
-define <vscale x 4 x float> @test_vlseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg2e32.v v6, (a0)
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 4 x float> @llvm.riscv.tuple.extract.nxv4f32.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-define <vscale x 4 x float> @test_vlseg2_mask_nxv4f32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv4f32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv4f32_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg2e32.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 4 x float> @llvm.riscv.tuple.extract.nxv4f32.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-
-define <vscale x 8 x float> @test_vlseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vlseg2e32.v v4, (a0)
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 8 x float> @llvm.riscv.tuple.extract.nxv8f32.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-define <vscale x 8 x float> @test_vlseg2_mask_nxv8f32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv8f32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv8f32_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vlseg2e32.v v4, (a0), v0.t
+; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 8 x float> @llvm.riscv.tuple.extract.nxv8f32.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-
-define <vscale x 1 x float> @test_vlseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg3e32.v v7, (a0)
+; CHECK-NEXT: vlseg3e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-define <vscale x 1 x float> @test_vlseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg3e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-
-define <vscale x 2 x float> @test_vlseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg3e32.v v7, (a0)
+; CHECK-NEXT: vlseg3e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-define <vscale x 2 x float> @test_vlseg3_mask_nxv2f32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv2f32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv2f32_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg3e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-
-define <vscale x 4 x float> @test_vlseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg3e32.v v6, (a0)
+; CHECK-NEXT: vlseg3e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 4 x float> @llvm.riscv.tuple.extract.nxv4f32.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-define <vscale x 4 x float> @test_vlseg3_mask_nxv4f32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv4f32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv4f32_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg3e32.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 4 x float> @llvm.riscv.tuple.extract.nxv4f32.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-
-define <vscale x 1 x float> @test_vlseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg4e32.v v7, (a0)
+; CHECK-NEXT: vlseg4e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-define <vscale x 1 x float> @test_vlseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg4e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-
-define <vscale x 2 x float> @test_vlseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg4e32.v v7, (a0)
+; CHECK-NEXT: vlseg4e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-define <vscale x 2 x float> @test_vlseg4_mask_nxv2f32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv2f32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv2f32_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg4e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-
-define <vscale x 4 x float> @test_vlseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg4e32.v v6, (a0)
+; CHECK-NEXT: vlseg4e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 4 x float> @llvm.riscv.tuple.extract.nxv4f32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-define <vscale x 4 x float> @test_vlseg4_mask_nxv4f32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv4f32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv4f32_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg4e32.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg4e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 4 x float> @llvm.riscv.tuple.extract.nxv4f32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-
-define <vscale x 1 x float> @test_vlseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg5e32.v v7, (a0)
+; CHECK-NEXT: vlseg5e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-define <vscale x 1 x float> @test_vlseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg5e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-
-define <vscale x 2 x float> @test_vlseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg5e32.v v7, (a0)
+; CHECK-NEXT: vlseg5e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-define <vscale x 2 x float> @test_vlseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg5e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-
-define <vscale x 1 x float> @test_vlseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg6e32.v v7, (a0)
+; CHECK-NEXT: vlseg6e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-define <vscale x 1 x float> @test_vlseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg6e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-
-define <vscale x 2 x float> @test_vlseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg6e32.v v7, (a0)
+; CHECK-NEXT: vlseg6e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-define <vscale x 2 x float> @test_vlseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg6e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-
-define <vscale x 1 x float> @test_vlseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg7e32.v v7, (a0)
+; CHECK-NEXT: vlseg7e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-define <vscale x 1 x float> @test_vlseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg7e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-
-define <vscale x 2 x float> @test_vlseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg7e32.v v7, (a0)
+; CHECK-NEXT: vlseg7e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-define <vscale x 2 x float> @test_vlseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg7e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-
-define <vscale x 1 x float> @test_vlseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg8e32.v v7, (a0)
+; CHECK-NEXT: vlseg8e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-define <vscale x 1 x float> @test_vlseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg8e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-
-define <vscale x 2 x float> @test_vlseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg8e32.v v7, (a0)
+; CHECK-NEXT: vlseg8e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-define <vscale x 2 x float> @test_vlseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg8e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-
-define <vscale x 1 x double> @test_vlseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg2e64.v v7, (a0)
+; CHECK-NEXT: vlseg2e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-define <vscale x 1 x double> @test_vlseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg2e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-
-define <vscale x 2 x double> @test_vlseg2_nxv2f64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv2f64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv2f64_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg2e64.v v6, (a0)
+; CHECK-NEXT: vlseg2e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 2 x double> @llvm.riscv.tuple.extract.nxv2f64.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-define <vscale x 2 x double> @test_vlseg2_mask_nxv2f64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv2f64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv2f64_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg2e64.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg2e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 2 x double> @llvm.riscv.tuple.extract.nxv2f64.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-
-define <vscale x 4 x double> @test_vlseg2_nxv4f64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv4f64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv4f64_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT: vlseg2e64.v v4, (a0)
+; CHECK-NEXT: vlseg2e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 4 x double> @llvm.riscv.tuple.extract.nxv4f64.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-define <vscale x 4 x double> @test_vlseg2_mask_nxv4f64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv4f64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv4f64_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT: vlseg2e64.v v4, (a0), v0.t
+; CHECK-NEXT: vlseg2e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 4 x double> @llvm.riscv.tuple.extract.nxv4f64.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-
-define <vscale x 1 x double> @test_vlseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg3e64.v v7, (a0)
+; CHECK-NEXT: vlseg3e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-define <vscale x 1 x double> @test_vlseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg3e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-
-define <vscale x 2 x double> @test_vlseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg3e64.v v6, (a0)
+; CHECK-NEXT: vlseg3e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 2 x double> @llvm.riscv.tuple.extract.nxv2f64.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-define <vscale x 2 x double> @test_vlseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg3e64.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg3e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 2 x double> @llvm.riscv.tuple.extract.nxv2f64.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-
-define <vscale x 1 x double> @test_vlseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg4e64.v v7, (a0)
+; CHECK-NEXT: vlseg4e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-define <vscale x 1 x double> @test_vlseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg4e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-
-define <vscale x 2 x double> @test_vlseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg4e64.v v6, (a0)
+; CHECK-NEXT: vlseg4e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 2 x double> @llvm.riscv.tuple.extract.nxv2f64.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-define <vscale x 2 x double> @test_vlseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg4e64.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg4e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 2 x double> @llvm.riscv.tuple.extract.nxv2f64.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-
-define <vscale x 1 x double> @test_vlseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg5e64.v v7, (a0)
+; CHECK-NEXT: vlseg5e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-define <vscale x 1 x double> @test_vlseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg5e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-
-define <vscale x 1 x double> @test_vlseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg6e64.v v7, (a0)
+; CHECK-NEXT: vlseg6e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-define <vscale x 1 x double> @test_vlseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg6e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-
-define <vscale x 1 x double> @test_vlseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg7e64.v v7, (a0)
+; CHECK-NEXT: vlseg7e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-define <vscale x 1 x double> @test_vlseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg7e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-
-define <vscale x 1 x double> @test_vlseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg8e64.v v7, (a0)
+; CHECK-NEXT: vlseg8e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-define <vscale x 1 x double> @test_vlseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg8e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-
-define <vscale x 1 x bfloat> @test_vlseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
}
-
-define <vscale x 1 x bfloat> @test_vlseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
}
-
-
-define <vscale x 2 x bfloat> @test_vlseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-define <vscale x 2 x bfloat> @test_vlseg2_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-
-define <vscale x 4 x bfloat> @test_vlseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-define <vscale x 4 x bfloat> @test_vlseg2_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-
-define <vscale x 8 x bfloat> @test_vlseg2_nxv8bf16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv8bf16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv8bf16_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v6, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 8 x bfloat> @llvm.riscv.tuple.extract.nxv8bf16.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-define <vscale x 8 x bfloat> @test_vlseg2_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 8 x bfloat> @llvm.riscv.tuple.extract.nxv8bf16.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-
-define <vscale x 16 x bfloat> @test_vlseg2_nxv16bf16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv16bf16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv16bf16_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v4, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 16 x bfloat> @llvm.riscv.tuple.extract.nxv16bf16.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 16 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-define <vscale x 16 x bfloat> @test_vlseg2_mask_nxv16bf16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 16 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv16bf16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 16 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv16bf16_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v4, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 16 x bfloat> @llvm.riscv.tuple.extract.nxv16bf16.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 16 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-
-define <vscale x 1 x bfloat> @test_vlseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0
}
-
-define <vscale x 1 x bfloat> @test_vlseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0
}
-
-
-define <vscale x 2 x bfloat> @test_vlseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-define <vscale x 2 x bfloat> @test_vlseg3_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-
-define <vscale x 4 x bfloat> @test_vlseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-define <vscale x 4 x bfloat> @test_vlseg3_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-
-define <vscale x 8 x bfloat> @test_vlseg3_nxv8bf16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv8bf16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv8bf16_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v6, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 8 x bfloat> @llvm.riscv.tuple.extract.nxv8bf16.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 8 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-define <vscale x 8 x bfloat> @test_vlseg3_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 8 x bfloat> @llvm.riscv.tuple.extract.nxv8bf16.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 8 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-
-define <vscale x 1 x bfloat> @test_vlseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0
}
-
-define <vscale x 1 x bfloat> @test_vlseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0
}
-
-
-define <vscale x 2 x bfloat> @test_vlseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-define <vscale x 2 x bfloat> @test_vlseg4_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-
-define <vscale x 4 x bfloat> @test_vlseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-define <vscale x 4 x bfloat> @test_vlseg4_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-
-define <vscale x 8 x bfloat> @test_vlseg4_nxv8bf16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv8bf16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv8bf16_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v6, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 8 x bfloat> @llvm.riscv.tuple.extract.nxv8bf16.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 8 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-define <vscale x 8 x bfloat> @test_vlseg4_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 8 x bfloat> @llvm.riscv.tuple.extract.nxv8bf16.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 8 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-
-define <vscale x 1 x bfloat> @test_vlseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0
}
-
-define <vscale x 1 x bfloat> @test_vlseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0
}
-
-
-define <vscale x 2 x bfloat> @test_vlseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-define <vscale x 2 x bfloat> @test_vlseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-
-define <vscale x 4 x bfloat> @test_vlseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-define <vscale x 4 x bfloat> @test_vlseg5_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-
-define <vscale x 1 x bfloat> @test_vlseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0
}
-
-define <vscale x 1 x bfloat> @test_vlseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0
}
-
-
-define <vscale x 2 x bfloat> @test_vlseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-define <vscale x 2 x bfloat> @test_vlseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-
-define <vscale x 4 x bfloat> @test_vlseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-define <vscale x 4 x bfloat> @test_vlseg6_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-
-define <vscale x 1 x bfloat> @test_vlseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0
}
-
-define <vscale x 1 x bfloat> @test_vlseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0
}
-
-
-define <vscale x 2 x bfloat> @test_vlseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-define <vscale x 2 x bfloat> @test_vlseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-
-define <vscale x 4 x bfloat> @test_vlseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-define <vscale x 4 x bfloat> @test_vlseg7_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-
-define <vscale x 1 x bfloat> @test_vlseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0)
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0
}
-
-define <vscale x 1 x bfloat> @test_vlseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0
}
-
-
-define <vscale x 2 x bfloat> @test_vlseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0)
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-define <vscale x 2 x bfloat> @test_vlseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-
-define <vscale x 4 x bfloat> @test_vlseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0)
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-define <vscale x 4 x bfloat> @test_vlseg8_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/vlseg-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vlseg-rv64.ll
index 16e5e7b9..faeabaf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vlseg-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vlseg-rv64.ll
@@ -2,4330 +2,3373 @@
; RUN: llc -mtriple=riscv64 -mattr=+zve64d,+f,+d,+zvfh,+zvfbfmin \
; RUN: -verify-machineinstrs < %s | FileCheck %s
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", <vscale x 1 x i8>, 2), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 2), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i8> @test_vlseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @test_vlseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0)
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", <vscale x 1 x i8>, 2) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", <vscale x 1 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 2) %0
}
-
-define <vscale x 1 x i8> @test_vlseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @test_vlseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", <vscale x 1 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 2) %0
}
-
-define <vscale x 1 x i8> @test_vlseg2_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @test_vlseg2_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0)
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> splat (i1 true), i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", <vscale x 1 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 2), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i8> @test_vlseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0)
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
}
-
-define <vscale x 2 x i8> @test_vlseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i8> @test_vlseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0)
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-define <vscale x 4 x i8> @test_vlseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2), ptr, <vscale x 8 x i1>, i64, i64, i64)
-
-define <vscale x 8 x i8> @test_vlseg2_nxv8i8_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv8i8_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv8i8_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0)
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-define <vscale x 8 x i8> @test_vlseg2_mask_nxv8i8_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv8i8_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv8i8_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2), ptr, <vscale x 16 x i1>, i64, i64, i64)
-
-define <vscale x 16 x i8> @test_vlseg2_nxv16i8_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv16i8_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv16i8_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vlseg2e8.v v6, (a0)
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 16 x i8> @llvm.riscv.tuple.extract.nxv16i8.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 16 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-define <vscale x 16 x i8> @test_vlseg2_mask_nxv16i8_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 16 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv16i8_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 16 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv16i8_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vlseg2e8.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg2e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 16 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 16 x i8> @llvm.riscv.tuple.extract.nxv16i8.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 16 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv32i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2), ptr, <vscale x 32 x i1>, i64, i64, i64)
-
-define <vscale x 32 x i8> @test_vlseg2_nxv32i8_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv32i8_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv32i8_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT: vlseg2e8.v v4, (a0)
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 32 x i8> @llvm.riscv.tuple.extract.nxv32i8.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 32 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-define <vscale x 32 x i8> @test_vlseg2_mask_nxv32i8_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 32 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv32i8_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 32 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv32i8_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT: vlseg2e8.v v4, (a0), v0.t
+; CHECK-NEXT: vlseg2e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv32i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 32 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 32 x i8> @llvm.riscv.tuple.extract.nxv32i8.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 32 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv1i8_3t(target("riscv.vector.tuple", <vscale x 1 x i8>, 3), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 3), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i8> @test_vlseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @test_vlseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg3e8.v v7, (a0)
+; CHECK-NEXT: vlseg3e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv1i8_3t(target("riscv.vector.tuple", <vscale x 1 x i8>, 3) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_3t(target("riscv.vector.tuple", <vscale x 1 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 3) %0
}
-
-define <vscale x 1 x i8> @test_vlseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @test_vlseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg3e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_3t(target("riscv.vector.tuple", <vscale x 1 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 3) %0
}
-
-define <vscale x 1 x i8> @test_vlseg3_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @test_vlseg3_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg3e8.v v7, (a0)
+; CHECK-NEXT: vlseg3e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> splat (i1 true), i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_3t(target("riscv.vector.tuple", <vscale x 1 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i8> @test_vlseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg3e8.v v7, (a0)
+; CHECK-NEXT: vlseg3e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0
}
-
-define <vscale x 2 x i8> @test_vlseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg3e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i8> @test_vlseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg3e8.v v7, (a0)
+; CHECK-NEXT: vlseg3e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-define <vscale x 4 x i8> @test_vlseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg3e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3), ptr, <vscale x 8 x i1>, i64, i64, i64)
-
-define <vscale x 8 x i8> @test_vlseg3_nxv8i8_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv8i8_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv8i8_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg3e8.v v7, (a0)
+; CHECK-NEXT: vlseg3e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-define <vscale x 8 x i8> @test_vlseg3_mask_nxv8i8_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv8i8_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv8i8_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg3e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv16i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3), ptr, <vscale x 16 x i1>, i64, i64, i64)
-
-define <vscale x 16 x i8> @test_vlseg3_nxv16i8_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv16i8_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv16i8_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vlseg3e8.v v6, (a0)
+; CHECK-NEXT: vlseg3e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 16 x i8> @llvm.riscv.tuple.extract.nxv16i8.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 16 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-define <vscale x 16 x i8> @test_vlseg3_mask_nxv16i8_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 16 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv16i8_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 16 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv16i8_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vlseg3e8.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg3e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv16i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 16 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 16 x i8> @llvm.riscv.tuple.extract.nxv16i8.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 16 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv1i8_4t(target("riscv.vector.tuple", <vscale x 1 x i8>, 4), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 4), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i8> @test_vlseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @test_vlseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg4e8.v v7, (a0)
+; CHECK-NEXT: vlseg4e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv1i8_4t(target("riscv.vector.tuple", <vscale x 1 x i8>, 4) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_4t(target("riscv.vector.tuple", <vscale x 1 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 4) %0
}
-
-define <vscale x 1 x i8> @test_vlseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @test_vlseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg4e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_4t(target("riscv.vector.tuple", <vscale x 1 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 4) %0
}
-
-define <vscale x 1 x i8> @test_vlseg4_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @test_vlseg4_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg4e8.v v7, (a0)
+; CHECK-NEXT: vlseg4e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> splat (i1 true), i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_4t(target("riscv.vector.tuple", <vscale x 1 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 4), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i8> @test_vlseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg4e8.v v7, (a0)
+; CHECK-NEXT: vlseg4e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0
}
-
-define <vscale x 2 x i8> @test_vlseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg4e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i8> @test_vlseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg4e8.v v7, (a0)
+; CHECK-NEXT: vlseg4e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-define <vscale x 4 x i8> @test_vlseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg4e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4), ptr, <vscale x 8 x i1>, i64, i64, i64)
-
-define <vscale x 8 x i8> @test_vlseg4_nxv8i8_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv8i8_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv8i8_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg4e8.v v7, (a0)
+; CHECK-NEXT: vlseg4e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-define <vscale x 8 x i8> @test_vlseg4_mask_nxv8i8_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv8i8_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv8i8_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg4e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv16i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4), ptr, <vscale x 16 x i1>, i64, i64, i64)
-
-define <vscale x 16 x i8> @test_vlseg4_nxv16i8_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv16i8_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv16i8_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vlseg4e8.v v6, (a0)
+; CHECK-NEXT: vlseg4e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 16 x i8> @llvm.riscv.tuple.extract.nxv16i8.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 16 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-define <vscale x 16 x i8> @test_vlseg4_mask_nxv16i8_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 16 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv16i8_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 16 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv16i8_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vlseg4e8.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv16i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 16 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 16 x i8> @llvm.riscv.tuple.extract.nxv16i8.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 16 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv1i8_5t(target("riscv.vector.tuple", <vscale x 1 x i8>, 5), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 5), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i8> @test_vlseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @test_vlseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg5e8.v v7, (a0)
+; CHECK-NEXT: vlseg5e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv1i8_5t(target("riscv.vector.tuple", <vscale x 1 x i8>, 5) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_5t(target("riscv.vector.tuple", <vscale x 1 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 5) %0
}
-
-define <vscale x 1 x i8> @test_vlseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @test_vlseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg5e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_5t(target("riscv.vector.tuple", <vscale x 1 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 5) %0
}
-
-define <vscale x 1 x i8> @test_vlseg5_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @test_vlseg5_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg5e8.v v7, (a0)
+; CHECK-NEXT: vlseg5e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> splat (i1 true), i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_5t(target("riscv.vector.tuple", <vscale x 1 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 5), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i8> @test_vlseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg5e8.v v7, (a0)
+; CHECK-NEXT: vlseg5e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0
}
-
-define <vscale x 2 x i8> @test_vlseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg5e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i8> @test_vlseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg5e8.v v7, (a0)
+; CHECK-NEXT: vlseg5e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-define <vscale x 4 x i8> @test_vlseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg5e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5), ptr, <vscale x 8 x i1>, i64, i64, i64)
-
-define <vscale x 8 x i8> @test_vlseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg5e8.v v7, (a0)
+; CHECK-NEXT: vlseg5e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-define <vscale x 8 x i8> @test_vlseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg5e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv1i8_6t(target("riscv.vector.tuple", <vscale x 1 x i8>, 6), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 6), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i8> @test_vlseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @test_vlseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg6e8.v v7, (a0)
+; CHECK-NEXT: vlseg6e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv1i8_6t(target("riscv.vector.tuple", <vscale x 1 x i8>, 6) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_6t(target("riscv.vector.tuple", <vscale x 1 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 6) %0
}
-
-define <vscale x 1 x i8> @test_vlseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @test_vlseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg6e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_6t(target("riscv.vector.tuple", <vscale x 1 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 6) %0
}
-
-define <vscale x 1 x i8> @test_vlseg6_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @test_vlseg6_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg6e8.v v7, (a0)
+; CHECK-NEXT: vlseg6e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> splat (i1 true), i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_6t(target("riscv.vector.tuple", <vscale x 1 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 6), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i8> @test_vlseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg6e8.v v7, (a0)
+; CHECK-NEXT: vlseg6e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0
}
-
-define <vscale x 2 x i8> @test_vlseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg6e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i8> @test_vlseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg6e8.v v7, (a0)
+; CHECK-NEXT: vlseg6e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-define <vscale x 4 x i8> @test_vlseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg6e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6), ptr, <vscale x 8 x i1>, i64, i64, i64)
-
-define <vscale x 8 x i8> @test_vlseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg6e8.v v7, (a0)
+; CHECK-NEXT: vlseg6e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-define <vscale x 8 x i8> @test_vlseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg6e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv1i8_7t(target("riscv.vector.tuple", <vscale x 1 x i8>, 7), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 7), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i8> @test_vlseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @test_vlseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0)
+; CHECK-NEXT: vlseg7e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv1i8_7t(target("riscv.vector.tuple", <vscale x 1 x i8>, 7) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_7t(target("riscv.vector.tuple", <vscale x 1 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 7) %0
}
-
-define <vscale x 1 x i8> @test_vlseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @test_vlseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_7t(target("riscv.vector.tuple", <vscale x 1 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 7) %0
}
-
-define <vscale x 1 x i8> @test_vlseg7_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @test_vlseg7_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0)
+; CHECK-NEXT: vlseg7e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> splat (i1 true), i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_7t(target("riscv.vector.tuple", <vscale x 1 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 7), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i8> @test_vlseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0)
+; CHECK-NEXT: vlseg7e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0
}
-
-define <vscale x 2 x i8> @test_vlseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i8> @test_vlseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0)
+; CHECK-NEXT: vlseg7e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-define <vscale x 4 x i8> @test_vlseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7), ptr, <vscale x 8 x i1>, i64, i64, i64)
-
-define <vscale x 8 x i8> @test_vlseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0)
+; CHECK-NEXT: vlseg7e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-define <vscale x 8 x i8> @test_vlseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv1i8_8t(target("riscv.vector.tuple", <vscale x 1 x i8>, 8), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 8), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i8> @test_vlseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @test_vlseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0)
+; CHECK-NEXT: vlseg8e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv1i8_8t(target("riscv.vector.tuple", <vscale x 1 x i8>, 8) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_8t(target("riscv.vector.tuple", <vscale x 1 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 8) %0
}
-
-define <vscale x 1 x i8> @test_vlseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @test_vlseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_8t(target("riscv.vector.tuple", <vscale x 1 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 8) %0
}
-
-define <vscale x 1 x i8> @test_vlseg8_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @test_vlseg8_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0)
+; CHECK-NEXT: vlseg8e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> splat (i1 true), i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_8t(target("riscv.vector.tuple", <vscale x 1 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 8), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i8> @test_vlseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0)
+; CHECK-NEXT: vlseg8e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0
}
-
-define <vscale x 2 x i8> @test_vlseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i8> @test_vlseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0)
+; CHECK-NEXT: vlseg8e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-define <vscale x 4 x i8> @test_vlseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8), ptr, <vscale x 8 x i1>, i64, i64, i64)
-
-define <vscale x 8 x i8> @test_vlseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0)
+; CHECK-NEXT: vlseg8e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-define <vscale x 8 x i8> @test_vlseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 2), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i16> @test_vlseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
}
-
-define <vscale x 1 x i16> @test_vlseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i16> @test_vlseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-define <vscale x 2 x i16> @test_vlseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i16> @test_vlseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-define <vscale x 4 x i16> @test_vlseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2), ptr, <vscale x 8 x i1>, i64, i64, i64)
-
-define <vscale x 8 x i16> @test_vlseg2_nxv8i16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv8i16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv8i16_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v6, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 8 x i16> @llvm.riscv.tuple.extract.nxv8i16.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-define <vscale x 8 x i16> @test_vlseg2_mask_nxv8i16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv8i16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv8i16_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 8 x i16> @llvm.riscv.tuple.extract.nxv8i16.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2), ptr, <vscale x 16 x i1>, i64, i64, i64)
-
-define <vscale x 16 x i16> @test_vlseg2_nxv16i16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv16i16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv16i16_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v4, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 16 x i16> @llvm.riscv.tuple.extract.nxv16i16.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 16 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-define <vscale x 16 x i16> @test_vlseg2_mask_nxv16i16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 16 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv16i16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 16 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv16i16_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v4, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 16 x i16> @llvm.riscv.tuple.extract.nxv16i16.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 16 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i16> @test_vlseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0
}
-
-define <vscale x 1 x i16> @test_vlseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i16> @test_vlseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-define <vscale x 2 x i16> @test_vlseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i16> @test_vlseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-define <vscale x 4 x i16> @test_vlseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3), ptr, <vscale x 8 x i1>, i64, i64, i64)
-
-define <vscale x 8 x i16> @test_vlseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v6, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 8 x i16> @llvm.riscv.tuple.extract.nxv8i16.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 8 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-define <vscale x 8 x i16> @test_vlseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 8 x i16> @llvm.riscv.tuple.extract.nxv8i16.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 8 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 4), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i16> @test_vlseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0
}
-
-define <vscale x 1 x i16> @test_vlseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i16> @test_vlseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-define <vscale x 2 x i16> @test_vlseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i16> @test_vlseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-define <vscale x 4 x i16> @test_vlseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4), ptr, <vscale x 8 x i1>, i64, i64, i64)
-
-define <vscale x 8 x i16> @test_vlseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v6, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 8 x i16> @llvm.riscv.tuple.extract.nxv8i16.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 8 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-define <vscale x 8 x i16> @test_vlseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 8 x i16> @llvm.riscv.tuple.extract.nxv8i16.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 8 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 5), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i16> @test_vlseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0
}
-
-define <vscale x 1 x i16> @test_vlseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i16> @test_vlseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-define <vscale x 2 x i16> @test_vlseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i16> @test_vlseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-define <vscale x 4 x i16> @test_vlseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 6), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i16> @test_vlseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0
}
-
-define <vscale x 1 x i16> @test_vlseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i16> @test_vlseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-define <vscale x 2 x i16> @test_vlseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i16> @test_vlseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-define <vscale x 4 x i16> @test_vlseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 7), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i16> @test_vlseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0
}
-
-define <vscale x 1 x i16> @test_vlseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i16> @test_vlseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-define <vscale x 2 x i16> @test_vlseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i16> @test_vlseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-define <vscale x 4 x i16> @test_vlseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 8), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i16> @test_vlseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0)
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0
}
-
-define <vscale x 1 x i16> @test_vlseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i16> @test_vlseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0)
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-define <vscale x 2 x i16> @test_vlseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i16> @test_vlseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0)
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-define <vscale x 4 x i16> @test_vlseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i32> @test_vlseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg2e32.v v7, (a0)
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-define <vscale x 1 x i32> @test_vlseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg2e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i32> @test_vlseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg2e32.v v7, (a0)
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-define <vscale x 2 x i32> @test_vlseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg2e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i32> @test_vlseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg2e32.v v6, (a0)
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-define <vscale x 4 x i32> @test_vlseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg2e32.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2), ptr, <vscale x 8 x i1>, i64, i64, i64)
-
-define <vscale x 8 x i32> @test_vlseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vlseg2e32.v v4, (a0)
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 8 x i32> @llvm.riscv.tuple.extract.nxv8i32.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-define <vscale x 8 x i32> @test_vlseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vlseg2e32.v v4, (a0), v0.t
+; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 8 x i32> @llvm.riscv.tuple.extract.nxv8i32.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i32> @test_vlseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg3e32.v v7, (a0)
+; CHECK-NEXT: vlseg3e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-define <vscale x 1 x i32> @test_vlseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg3e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i32> @test_vlseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg3e32.v v7, (a0)
+; CHECK-NEXT: vlseg3e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-define <vscale x 2 x i32> @test_vlseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg3e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i32> @test_vlseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg3e32.v v6, (a0)
+; CHECK-NEXT: vlseg3e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-define <vscale x 4 x i32> @test_vlseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg3e32.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i32> @test_vlseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg4e32.v v7, (a0)
+; CHECK-NEXT: vlseg4e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-define <vscale x 1 x i32> @test_vlseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg4e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i32> @test_vlseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg4e32.v v7, (a0)
+; CHECK-NEXT: vlseg4e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-define <vscale x 2 x i32> @test_vlseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg4e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i32> @test_vlseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg4e32.v v6, (a0)
+; CHECK-NEXT: vlseg4e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-define <vscale x 4 x i32> @test_vlseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg4e32.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg4e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i32> @test_vlseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg5e32.v v7, (a0)
+; CHECK-NEXT: vlseg5e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-define <vscale x 1 x i32> @test_vlseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg5e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i32> @test_vlseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg5e32.v v7, (a0)
+; CHECK-NEXT: vlseg5e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-define <vscale x 2 x i32> @test_vlseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg5e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i32> @test_vlseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg6e32.v v7, (a0)
+; CHECK-NEXT: vlseg6e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-define <vscale x 1 x i32> @test_vlseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg6e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i32> @test_vlseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg6e32.v v7, (a0)
+; CHECK-NEXT: vlseg6e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-define <vscale x 2 x i32> @test_vlseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg6e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i32> @test_vlseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg7e32.v v7, (a0)
+; CHECK-NEXT: vlseg7e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-define <vscale x 1 x i32> @test_vlseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg7e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i32> @test_vlseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg7e32.v v7, (a0)
+; CHECK-NEXT: vlseg7e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-define <vscale x 2 x i32> @test_vlseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg7e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i32> @test_vlseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg8e32.v v7, (a0)
+; CHECK-NEXT: vlseg8e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-define <vscale x 1 x i32> @test_vlseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg8e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i32> @test_vlseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg8e32.v v7, (a0)
+; CHECK-NEXT: vlseg8e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-define <vscale x 2 x i32> @test_vlseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg8e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i64> @test_vlseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg2e64.v v7, (a0)
+; CHECK-NEXT: vlseg2e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-define <vscale x 1 x i64> @test_vlseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg2e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i64> @test_vlseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg2e64.v v6, (a0)
+; CHECK-NEXT: vlseg2e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 2 x i64> @llvm.riscv.tuple.extract.nxv2i64.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-define <vscale x 2 x i64> @test_vlseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg2e64.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg2e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 2 x i64> @llvm.riscv.tuple.extract.nxv2i64.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i64> @test_vlseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT: vlseg2e64.v v4, (a0)
+; CHECK-NEXT: vlseg2e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 4 x i64> @llvm.riscv.tuple.extract.nxv4i64.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-define <vscale x 4 x i64> @test_vlseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT: vlseg2e64.v v4, (a0), v0.t
+; CHECK-NEXT: vlseg2e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 4 x i64> @llvm.riscv.tuple.extract.nxv4i64.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i64> @test_vlseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg3e64.v v7, (a0)
+; CHECK-NEXT: vlseg3e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-define <vscale x 1 x i64> @test_vlseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg3e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i64> @test_vlseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg3e64.v v6, (a0)
+; CHECK-NEXT: vlseg3e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 2 x i64> @llvm.riscv.tuple.extract.nxv2i64.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-define <vscale x 2 x i64> @test_vlseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg3e64.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg3e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 2 x i64> @llvm.riscv.tuple.extract.nxv2i64.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i64> @test_vlseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg4e64.v v7, (a0)
+; CHECK-NEXT: vlseg4e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-define <vscale x 1 x i64> @test_vlseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg4e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i64> @test_vlseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg4e64.v v6, (a0)
+; CHECK-NEXT: vlseg4e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 2 x i64> @llvm.riscv.tuple.extract.nxv2i64.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-define <vscale x 2 x i64> @test_vlseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg4e64.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg4e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 2 x i64> @llvm.riscv.tuple.extract.nxv2i64.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i64> @test_vlseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg5e64.v v7, (a0)
+; CHECK-NEXT: vlseg5e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-define <vscale x 1 x i64> @test_vlseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg5e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i64> @test_vlseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg6e64.v v7, (a0)
+; CHECK-NEXT: vlseg6e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-define <vscale x 1 x i64> @test_vlseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg6e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i64> @test_vlseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg7e64.v v7, (a0)
+; CHECK-NEXT: vlseg7e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-define <vscale x 1 x i64> @test_vlseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg7e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i64> @test_vlseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg8e64.v v7, (a0)
+; CHECK-NEXT: vlseg8e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-define <vscale x 1 x i64> @test_vlseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg8e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-
-define <vscale x 1 x half> @test_vlseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
}
-
-define <vscale x 1 x half> @test_vlseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-define <vscale x 2 x half> @test_vlseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-define <vscale x 4 x half> @test_vlseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-
-define <vscale x 8 x half> @test_vlseg2_nxv8f16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv8f16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv8f16_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v6, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 8 x half> @llvm.riscv.tuple.extract.nxv8f16.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-define <vscale x 8 x half> @test_vlseg2_mask_nxv8f16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv8f16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv8f16_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 8 x half> @llvm.riscv.tuple.extract.nxv8f16.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-
-define <vscale x 16 x half> @test_vlseg2_nxv16f16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv16f16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv16f16_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v4, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 16 x half> @llvm.riscv.tuple.extract.nxv16f16.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 16 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-define <vscale x 16 x half> @test_vlseg2_mask_nxv16f16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 16 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv16f16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 16 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv16f16_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v4, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 16 x half> @llvm.riscv.tuple.extract.nxv16f16.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 16 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-
-define <vscale x 1 x half> @test_vlseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0
}
-
-define <vscale x 1 x half> @test_vlseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-define <vscale x 2 x half> @test_vlseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-define <vscale x 4 x half> @test_vlseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-
-define <vscale x 8 x half> @test_vlseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v6, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 8 x half> @llvm.riscv.tuple.extract.nxv8f16.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 8 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-define <vscale x 8 x half> @test_vlseg3_mask_nxv8f16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv8f16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv8f16_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 8 x half> @llvm.riscv.tuple.extract.nxv8f16.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 8 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-
-define <vscale x 1 x half> @test_vlseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0
}
-
-define <vscale x 1 x half> @test_vlseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-define <vscale x 2 x half> @test_vlseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-define <vscale x 4 x half> @test_vlseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-
-define <vscale x 8 x half> @test_vlseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v6, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 8 x half> @llvm.riscv.tuple.extract.nxv8f16.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 8 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-define <vscale x 8 x half> @test_vlseg4_mask_nxv8f16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv8f16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv8f16_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 8 x half> @llvm.riscv.tuple.extract.nxv8f16.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 8 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-
-define <vscale x 1 x half> @test_vlseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0
}
-
-define <vscale x 1 x half> @test_vlseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-define <vscale x 2 x half> @test_vlseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-define <vscale x 4 x half> @test_vlseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-
-define <vscale x 1 x half> @test_vlseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0
}
-
-define <vscale x 1 x half> @test_vlseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-define <vscale x 2 x half> @test_vlseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-define <vscale x 4 x half> @test_vlseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-
-define <vscale x 1 x half> @test_vlseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0
}
-
-define <vscale x 1 x half> @test_vlseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-define <vscale x 2 x half> @test_vlseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-define <vscale x 4 x half> @test_vlseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-
-define <vscale x 1 x half> @test_vlseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0)
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0
}
-
-define <vscale x 1 x half> @test_vlseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0)
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-define <vscale x 2 x half> @test_vlseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0)
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-define <vscale x 4 x half> @test_vlseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-
-define <vscale x 1 x float> @test_vlseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg2e32.v v7, (a0)
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-define <vscale x 1 x float> @test_vlseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg2e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-
-define <vscale x 2 x float> @test_vlseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg2e32.v v7, (a0)
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-define <vscale x 2 x float> @test_vlseg2_mask_nxv2f32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv2f32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv2f32_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg2e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-
-define <vscale x 4 x float> @test_vlseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg2e32.v v6, (a0)
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 4 x float> @llvm.riscv.tuple.extract.nxv4f32.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-define <vscale x 4 x float> @test_vlseg2_mask_nxv4f32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv4f32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv4f32_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg2e32.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 4 x float> @llvm.riscv.tuple.extract.nxv4f32.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-
-define <vscale x 8 x float> @test_vlseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vlseg2e32.v v4, (a0)
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 8 x float> @llvm.riscv.tuple.extract.nxv8f32.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-define <vscale x 8 x float> @test_vlseg2_mask_nxv8f32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv8f32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv8f32_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vlseg2e32.v v4, (a0), v0.t
+; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 8 x float> @llvm.riscv.tuple.extract.nxv8f32.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-
-define <vscale x 1 x float> @test_vlseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg3e32.v v7, (a0)
+; CHECK-NEXT: vlseg3e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-define <vscale x 1 x float> @test_vlseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg3e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-
-define <vscale x 2 x float> @test_vlseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg3e32.v v7, (a0)
+; CHECK-NEXT: vlseg3e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-define <vscale x 2 x float> @test_vlseg3_mask_nxv2f32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv2f32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv2f32_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg3e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-
-define <vscale x 4 x float> @test_vlseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg3e32.v v6, (a0)
+; CHECK-NEXT: vlseg3e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 4 x float> @llvm.riscv.tuple.extract.nxv4f32.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-define <vscale x 4 x float> @test_vlseg3_mask_nxv4f32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv4f32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv4f32_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg3e32.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 4 x float> @llvm.riscv.tuple.extract.nxv4f32.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-
-define <vscale x 1 x float> @test_vlseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg4e32.v v7, (a0)
+; CHECK-NEXT: vlseg4e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-define <vscale x 1 x float> @test_vlseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg4e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-
-define <vscale x 2 x float> @test_vlseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg4e32.v v7, (a0)
+; CHECK-NEXT: vlseg4e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-define <vscale x 2 x float> @test_vlseg4_mask_nxv2f32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv2f32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv2f32_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg4e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-
-define <vscale x 4 x float> @test_vlseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg4e32.v v6, (a0)
+; CHECK-NEXT: vlseg4e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 4 x float> @llvm.riscv.tuple.extract.nxv4f32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-define <vscale x 4 x float> @test_vlseg4_mask_nxv4f32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv4f32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv4f32_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg4e32.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg4e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 4 x float> @llvm.riscv.tuple.extract.nxv4f32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-
-define <vscale x 1 x float> @test_vlseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg5e32.v v7, (a0)
+; CHECK-NEXT: vlseg5e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-define <vscale x 1 x float> @test_vlseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg5e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-
-define <vscale x 2 x float> @test_vlseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg5e32.v v7, (a0)
+; CHECK-NEXT: vlseg5e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-define <vscale x 2 x float> @test_vlseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg5e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-
-define <vscale x 1 x float> @test_vlseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg6e32.v v7, (a0)
+; CHECK-NEXT: vlseg6e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-define <vscale x 1 x float> @test_vlseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg6e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-
-define <vscale x 2 x float> @test_vlseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg6e32.v v7, (a0)
+; CHECK-NEXT: vlseg6e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-define <vscale x 2 x float> @test_vlseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg6e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-
-define <vscale x 1 x float> @test_vlseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg7e32.v v7, (a0)
+; CHECK-NEXT: vlseg7e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-define <vscale x 1 x float> @test_vlseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg7e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-
-define <vscale x 2 x float> @test_vlseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg7e32.v v7, (a0)
+; CHECK-NEXT: vlseg7e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-define <vscale x 2 x float> @test_vlseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg7e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-
-define <vscale x 1 x float> @test_vlseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg8e32.v v7, (a0)
+; CHECK-NEXT: vlseg8e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-define <vscale x 1 x float> @test_vlseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg8e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-
-define <vscale x 2 x float> @test_vlseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg8e32.v v7, (a0)
+; CHECK-NEXT: vlseg8e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i64 %vl, i64 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-define <vscale x 2 x float> @test_vlseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg8e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5)
- %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x float> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-
-define <vscale x 1 x double> @test_vlseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg2e64.v v7, (a0)
+; CHECK-NEXT: vlseg2e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-define <vscale x 1 x double> @test_vlseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg2e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-
-define <vscale x 2 x double> @test_vlseg2_nxv2f64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv2f64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv2f64_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg2e64.v v6, (a0)
+; CHECK-NEXT: vlseg2e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 2 x double> @llvm.riscv.tuple.extract.nxv2f64.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-define <vscale x 2 x double> @test_vlseg2_mask_nxv2f64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv2f64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv2f64_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg2e64.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg2e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 2 x double> @llvm.riscv.tuple.extract.nxv2f64.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-
-define <vscale x 4 x double> @test_vlseg2_nxv4f64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv4f64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv4f64_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT: vlseg2e64.v v4, (a0)
+; CHECK-NEXT: vlseg2e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 4 x double> @llvm.riscv.tuple.extract.nxv4f64.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-define <vscale x 4 x double> @test_vlseg2_mask_nxv4f64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv4f64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv4f64_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT: vlseg2e64.v v4, (a0), v0.t
+; CHECK-NEXT: vlseg2e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 4 x double> @llvm.riscv.tuple.extract.nxv4f64.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-
-define <vscale x 1 x double> @test_vlseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg3e64.v v7, (a0)
+; CHECK-NEXT: vlseg3e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-define <vscale x 1 x double> @test_vlseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg3e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-
-define <vscale x 2 x double> @test_vlseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg3e64.v v6, (a0)
+; CHECK-NEXT: vlseg3e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 2 x double> @llvm.riscv.tuple.extract.nxv2f64.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-define <vscale x 2 x double> @test_vlseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg3e64.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg3e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 2 x double> @llvm.riscv.tuple.extract.nxv2f64.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-
-define <vscale x 1 x double> @test_vlseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg4e64.v v7, (a0)
+; CHECK-NEXT: vlseg4e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-define <vscale x 1 x double> @test_vlseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg4e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-
-define <vscale x 2 x double> @test_vlseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg4e64.v v6, (a0)
+; CHECK-NEXT: vlseg4e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 2 x double> @llvm.riscv.tuple.extract.nxv2f64.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-define <vscale x 2 x double> @test_vlseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg4e64.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg4e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 2 x double> @llvm.riscv.tuple.extract.nxv2f64.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-
-define <vscale x 1 x double> @test_vlseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg5e64.v v7, (a0)
+; CHECK-NEXT: vlseg5e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-define <vscale x 1 x double> @test_vlseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg5e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-
-define <vscale x 1 x double> @test_vlseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg6e64.v v7, (a0)
+; CHECK-NEXT: vlseg6e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-define <vscale x 1 x double> @test_vlseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg6e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-
-define <vscale x 1 x double> @test_vlseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg7e64.v v7, (a0)
+; CHECK-NEXT: vlseg7e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-define <vscale x 1 x double> @test_vlseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg7e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-
-define <vscale x 1 x double> @test_vlseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg8e64.v v7, (a0)
+; CHECK-NEXT: vlseg8e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-define <vscale x 1 x double> @test_vlseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg8e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x double> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-
-define <vscale x 1 x bfloat> @test_vlseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
}
-
-define <vscale x 1 x bfloat> @test_vlseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
}
-
-
-define <vscale x 2 x bfloat> @test_vlseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-define <vscale x 2 x bfloat> @test_vlseg2_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-
-define <vscale x 4 x bfloat> @test_vlseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-define <vscale x 4 x bfloat> @test_vlseg2_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-
-define <vscale x 8 x bfloat> @test_vlseg2_nxv8bf16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv8bf16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv8bf16_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v6, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 8 x bfloat> @llvm.riscv.tuple.extract.nxv8bf16.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-define <vscale x 8 x bfloat> @test_vlseg2_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 8 x bfloat> @llvm.riscv.tuple.extract.nxv8bf16.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-
-define <vscale x 16 x bfloat> @test_vlseg2_nxv16bf16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv16bf16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv16bf16_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v4, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 16 x bfloat> @llvm.riscv.tuple.extract.nxv16bf16.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 16 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-define <vscale x 16 x bfloat> @test_vlseg2_mask_nxv16bf16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 16 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv16bf16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 16 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv16bf16_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v4, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 16 x bfloat> @llvm.riscv.tuple.extract.nxv16bf16.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 16 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-
-define <vscale x 1 x bfloat> @test_vlseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0
}
-
-define <vscale x 1 x bfloat> @test_vlseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0
}
-
-
-define <vscale x 2 x bfloat> @test_vlseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-define <vscale x 2 x bfloat> @test_vlseg3_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-
-define <vscale x 4 x bfloat> @test_vlseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-define <vscale x 4 x bfloat> @test_vlseg3_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-
-define <vscale x 8 x bfloat> @test_vlseg3_nxv8bf16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv8bf16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv8bf16_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v6, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 8 x bfloat> @llvm.riscv.tuple.extract.nxv8bf16.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 8 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-define <vscale x 8 x bfloat> @test_vlseg3_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 8 x bfloat> @llvm.riscv.tuple.extract.nxv8bf16.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 8 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-
-define <vscale x 1 x bfloat> @test_vlseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0
}
-
-define <vscale x 1 x bfloat> @test_vlseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0
}
-
-
-define <vscale x 2 x bfloat> @test_vlseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-define <vscale x 2 x bfloat> @test_vlseg4_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-
-define <vscale x 4 x bfloat> @test_vlseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-define <vscale x 4 x bfloat> @test_vlseg4_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-
-define <vscale x 8 x bfloat> @test_vlseg4_nxv8bf16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv8bf16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv8bf16_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v6, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 8 x bfloat> @llvm.riscv.tuple.extract.nxv8bf16.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 8 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-define <vscale x 8 x bfloat> @test_vlseg4_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 8 x bfloat> @llvm.riscv.tuple.extract.nxv8bf16.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 8 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-
-define <vscale x 1 x bfloat> @test_vlseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0
}
-
-define <vscale x 1 x bfloat> @test_vlseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0
}
-
-
-define <vscale x 2 x bfloat> @test_vlseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-define <vscale x 2 x bfloat> @test_vlseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-
-define <vscale x 4 x bfloat> @test_vlseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-define <vscale x 4 x bfloat> @test_vlseg5_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-
-define <vscale x 1 x bfloat> @test_vlseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0
}
-
-define <vscale x 1 x bfloat> @test_vlseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0
}
-
-
-define <vscale x 2 x bfloat> @test_vlseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-define <vscale x 2 x bfloat> @test_vlseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-
-define <vscale x 4 x bfloat> @test_vlseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-define <vscale x 4 x bfloat> @test_vlseg6_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-
-define <vscale x 1 x bfloat> @test_vlseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0
}
-
-define <vscale x 1 x bfloat> @test_vlseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0
}
-
-
-define <vscale x 2 x bfloat> @test_vlseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-define <vscale x 2 x bfloat> @test_vlseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-
-define <vscale x 4 x bfloat> @test_vlseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-define <vscale x 4 x bfloat> @test_vlseg7_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-
-define <vscale x 1 x bfloat> @test_vlseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0)
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0
}
-
-define <vscale x 1 x bfloat> @test_vlseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0
}
-
-
-define <vscale x 2 x bfloat> @test_vlseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0)
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-define <vscale x 2 x bfloat> @test_vlseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-
-define <vscale x 4 x bfloat> @test_vlseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0)
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-define <vscale x 4 x bfloat> @test_vlseg8_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
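+; Note: now that the whole tuple is returned instead of an extracted segment,
+; the segment load is expected to target v8, the first register of the
+; returned tuple value, rather than v7.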
-
diff --git a/llvm/test/CodeGen/WebAssembly/libcall_vectorized.ll b/llvm/test/CodeGen/WebAssembly/libcall_vectorized.ll
new file mode 100644
index 0000000..2d1056f
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/libcall_vectorized.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-keep-registers -mattr=+simd128 | FileCheck %s
+
+target triple = "wasm32-unknown-unknown"
+
+declare <4 x float> @llvm.exp10.v4f32(<4 x float>)
+
+define <4 x float> @exp10_f32v4(<4 x float> %v) {
+; CHECK-LABEL: exp10_f32v4:
+; CHECK: .functype exp10_f32v4 (v128) -> (v128)
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: local.get $push12=, 0
+; CHECK-NEXT: f32x4.extract_lane $push0=, $pop12, 0
+; CHECK-NEXT: call $push1=, exp10f, $pop0
+; CHECK-NEXT: f32x4.splat $push2=, $pop1
+; CHECK-NEXT: local.get $push13=, 0
+; CHECK-NEXT: f32x4.extract_lane $push3=, $pop13, 1
+; CHECK-NEXT: call $push4=, exp10f, $pop3
+; CHECK-NEXT: f32x4.replace_lane $push5=, $pop2, 1, $pop4
+; CHECK-NEXT: local.get $push14=, 0
+; CHECK-NEXT: f32x4.extract_lane $push6=, $pop14, 2
+; CHECK-NEXT: call $push7=, exp10f, $pop6
+; CHECK-NEXT: f32x4.replace_lane $push8=, $pop5, 2, $pop7
+; CHECK-NEXT: local.get $push15=, 0
+; CHECK-NEXT: f32x4.extract_lane $push9=, $pop15, 3
+; CHECK-NEXT: call $push10=, exp10f, $pop9
+; CHECK-NEXT: f32x4.replace_lane $push11=, $pop8, 3, $pop10
+; CHECK-NEXT: return $pop11
+entry:
+ %r = call <4 x float> @llvm.exp10.v4f32(<4 x float> %v)
+ ret <4 x float> %r
+}
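+; No vector libcall exists for exp10, so the v4f32 intrinsic is expected to be
+; scalarized: each lane is extracted, passed to the scalar exp10f libcall, and
+; the results are reassembled with splat/replace_lane.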
diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-sjlj-alloca.ll b/llvm/test/CodeGen/WebAssembly/lower-em-sjlj-alloca.ll
new file mode 100644
index 0000000..0f968de
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/lower-em-sjlj-alloca.ll
@@ -0,0 +1,129 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -wasm-lower-em-ehsjlj -wasm-enable-sjlj -mtriple=wasm32-unknown-emscripten < %s | FileCheck %s
+
+@buf = external global i8
+declare i32 @setjmp(ptr) returns_twice
+declare void @dummy()
+
+define void @test_static() {
+; CHECK-LABEL: define void @test_static() personality ptr @__gxx_wasm_personality_v0 {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[X:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[FUNCTIONINVOCATIONID:%.*]] = alloca i32, align 4
+; CHECK-NEXT: br label %[[SETJMP_DISPATCH:.*]]
+; CHECK: [[SETJMP_DISPATCH]]:
+; CHECK-NEXT: [[VAL1:%.*]] = phi i32 [ [[VAL:%.*]], %[[IF_END:.*]] ], [ undef, %[[ENTRY]] ]
+; CHECK-NEXT: [[LABEL_PHI:%.*]] = phi i32 [ [[LABEL:%.*]], %[[IF_END]] ], [ -1, %[[ENTRY]] ]
+; CHECK-NEXT: switch i32 [[LABEL_PHI]], label %[[ENTRY_SPLIT:.*]] [
+; CHECK-NEXT: i32 1, label %[[ENTRY_SPLIT_SPLIT:.*]]
+; CHECK-NEXT: ]
+; CHECK: [[ENTRY_SPLIT]]:
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[X]])
+; CHECK-NEXT: call void @__wasm_setjmp(ptr @buf, i32 1, ptr [[FUNCTIONINVOCATIONID]])
+; CHECK-NEXT: br label %[[ENTRY_SPLIT_SPLIT]]
+; CHECK: [[ENTRY_SPLIT_SPLIT]]:
+; CHECK-NEXT: [[SETJMP_RET:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT]] ], [ [[VAL1]], %[[SETJMP_DISPATCH]] ]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[SETJMP_RET]], 0
+; CHECK-NEXT: br i1 [[CMP]], label %[[IF:.*]], label %[[ELSE:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: invoke void @dummy()
+; CHECK-NEXT: to [[DOTNOEXC:label %.*]] unwind label %[[CATCH_DISPATCH_LONGJMP:.*]]
+; CHECK: [[_NOEXC:.*:]]
+; CHECK-NEXT: ret void
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[X]])
+; CHECK-NEXT: ret void
+; CHECK: [[CATCH_DISPATCH_LONGJMP]]:
+; CHECK-NEXT: [[TMP0:%.*]] = catchswitch within none [label %catch.longjmp] unwind to caller
+; CHECK: [[CATCH_LONGJMP:.*:]]
+; CHECK-NEXT: [[TMP1:%.*]] = catchpad within [[TMP0]] []
+; CHECK-NEXT: [[THROWN:%.*]] = call ptr @llvm.wasm.catch(i32 1)
+; CHECK-NEXT: [[ENV_GEP:%.*]] = getelementptr { ptr, i32 }, ptr [[THROWN]], i32 0, i32 0
+; CHECK-NEXT: [[VAL_GEP:%.*]] = getelementptr { ptr, i32 }, ptr [[THROWN]], i32 0, i32 1
+; CHECK-NEXT: [[ENV:%.*]] = load ptr, ptr [[ENV_GEP]], align 4
+; CHECK-NEXT: [[VAL]] = load i32, ptr [[VAL_GEP]], align 4
+; CHECK-NEXT: [[LABEL]] = call i32 @__wasm_setjmp_test(ptr [[ENV]], ptr [[FUNCTIONINVOCATIONID]]) [ "funclet"(token [[TMP1]]) ]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[LABEL]], 0
+; CHECK-NEXT: br i1 [[TMP2]], label %[[IF_THEN:.*]], label %[[IF_END]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @__wasm_longjmp(ptr [[ENV]], i32 [[VAL]]) [ "funclet"(token [[TMP1]]) ]
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END]]:
+; CHECK-NEXT: catchret from [[TMP1]] to label %[[SETJMP_DISPATCH]]
+;
+entry:
+ %x = alloca i32, align 4
+ call void @llvm.lifetime.start.p0(i64 4, ptr %x)
+ %call = call i32 @setjmp(ptr @buf) returns_twice
+ %cmp = icmp eq i32 %call, 0
+ br i1 %cmp, label %if, label %else
+
+if:
+ call void @dummy()
+ ret void
+
+else:
+ call void @llvm.lifetime.end.p0(i64 4, ptr %x)
+ ret void
+}
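+; The fixed-size alloca above should stay in the entry block, ahead of the
+; setjmp dispatch loop, so it is materialized only once per invocation.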
+
+define void @test_dynamic(i32 %size) {
+; CHECK-LABEL: define void @test_dynamic(
+; CHECK-SAME: i32 [[SIZE:%.*]]) personality ptr @__gxx_wasm_personality_v0 {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[FUNCTIONINVOCATIONID:%.*]] = alloca i32, align 4
+; CHECK-NEXT: br label %[[SETJMP_DISPATCH:.*]]
+; CHECK: [[SETJMP_DISPATCH]]:
+; CHECK-NEXT: [[VAL1:%.*]] = phi i32 [ [[VAL:%.*]], %[[IF_END:.*]] ], [ undef, %[[ENTRY]] ]
+; CHECK-NEXT: [[LABEL_PHI:%.*]] = phi i32 [ [[LABEL:%.*]], %[[IF_END]] ], [ -1, %[[ENTRY]] ]
+; CHECK-NEXT: switch i32 [[LABEL_PHI]], label %[[ENTRY_SPLIT:.*]] [
+; CHECK-NEXT: i32 1, label %[[ENTRY_SPLIT_SPLIT:.*]]
+; CHECK-NEXT: ]
+; CHECK: [[ENTRY_SPLIT]]:
+; CHECK-NEXT: [[X:%.*]] = alloca i32, i32 [[SIZE]], align 4
+; CHECK-NEXT: call void @__wasm_setjmp(ptr @buf, i32 1, ptr [[FUNCTIONINVOCATIONID]])
+; CHECK-NEXT: br label %[[ENTRY_SPLIT_SPLIT]]
+; CHECK: [[ENTRY_SPLIT_SPLIT]]:
+; CHECK-NEXT: [[SETJMP_RET:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT]] ], [ [[VAL1]], %[[SETJMP_DISPATCH]] ]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[SETJMP_RET]], 0
+; CHECK-NEXT: br i1 [[CMP]], label %[[IF:.*]], label %[[ELSE:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: invoke void @dummy()
+; CHECK-NEXT: to [[DOTNOEXC:label %.*]] unwind label %[[CATCH_DISPATCH_LONGJMP:.*]]
+; CHECK: [[_NOEXC:.*:]]
+; CHECK-NEXT: ret void
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: ret void
+; CHECK: [[CATCH_DISPATCH_LONGJMP]]:
+; CHECK-NEXT: [[TMP0:%.*]] = catchswitch within none [label %catch.longjmp] unwind to caller
+; CHECK: [[CATCH_LONGJMP:.*:]]
+; CHECK-NEXT: [[TMP1:%.*]] = catchpad within [[TMP0]] []
+; CHECK-NEXT: [[THROWN:%.*]] = call ptr @llvm.wasm.catch(i32 1)
+; CHECK-NEXT: [[ENV_GEP:%.*]] = getelementptr { ptr, i32 }, ptr [[THROWN]], i32 0, i32 0
+; CHECK-NEXT: [[VAL_GEP:%.*]] = getelementptr { ptr, i32 }, ptr [[THROWN]], i32 0, i32 1
+; CHECK-NEXT: [[ENV:%.*]] = load ptr, ptr [[ENV_GEP]], align 4
+; CHECK-NEXT: [[VAL]] = load i32, ptr [[VAL_GEP]], align 4
+; CHECK-NEXT: [[LABEL]] = call i32 @__wasm_setjmp_test(ptr [[ENV]], ptr [[FUNCTIONINVOCATIONID]]) [ "funclet"(token [[TMP1]]) ]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[LABEL]], 0
+; CHECK-NEXT: br i1 [[TMP2]], label %[[IF_THEN:.*]], label %[[IF_END]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: call void @__wasm_longjmp(ptr [[ENV]], i32 [[VAL]]) [ "funclet"(token [[TMP1]]) ]
+; CHECK-NEXT: unreachable
+; CHECK: [[IF_END]]:
+; CHECK-NEXT: catchret from [[TMP1]] to label %[[SETJMP_DISPATCH]]
+;
+entry:
+ %x = alloca i32, i32 %size, align 4
+ call void @llvm.lifetime.start.p0(i64 -1, ptr %x)
+ %call = call i32 @setjmp(ptr @buf) returns_twice
+ %cmp = icmp eq i32 %call, 0
+ br i1 %cmp, label %if, label %else
+
+if:
+ call void @dummy()
+ ret void
+
+else:
+ call void @llvm.lifetime.end.p0(i64 -1, ptr %x)
+ ret void
+}
diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-sjlj-debuginfo.ll b/llvm/test/CodeGen/WebAssembly/lower-em-sjlj-debuginfo.ll
index fec9836..bab8403 100644
--- a/llvm/test/CodeGen/WebAssembly/lower-em-sjlj-debuginfo.ll
+++ b/llvm/test/CodeGen/WebAssembly/lower-em-sjlj-debuginfo.ll
@@ -16,10 +16,10 @@ entry:
call void @foo(), !dbg !7
ret void, !dbg !8
; CHECK: entry:
- ; CHECK-NEXT: %functionInvocationId = alloca i32, align 4, !dbg ![[DL0:.*]]
+ ; CHECK-NEXT: %buf = alloca [1 x %struct.__jmp_buf_tag], align 16, !dbg ![[DL0:.*]]
+ ; CHECK-NEXT: %functionInvocationId = alloca i32, align 4, !dbg ![[DL0]]
; CHECK: entry.split:
- ; CHECK: alloca {{.*}}, !dbg ![[DL0]]
; CHECK: call void @__wasm_setjmp{{.*}}, !dbg ![[DL1:.*]]
; CHECK-NEXT: br {{.*}}, !dbg ![[DL2:.*]]
diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll b/llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll
index b584342..51dcf2f 100644
--- a/llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll
+++ b/llvm/test/CodeGen/WebAssembly/lower-em-sjlj.ll
@@ -22,17 +22,17 @@ entry:
call void @longjmp(ptr %buf, i32 1) #1
unreachable
; CHECK: entry:
+; CHECK-NEXT: %buf = alloca [1 x %struct.__jmp_buf_tag], align 16
; CHECK-NEXT: %functionInvocationId = alloca i32, align 4
; CHECK-NEXT: br label %entry.split
; CHECK: entry.split
-; CHECK-NEXT: %[[BUF:.*]] = alloca [1 x %struct.__jmp_buf_tag]
-; CHECK-NEXT: call void @__wasm_setjmp(ptr %[[BUF]], i32 1, ptr %functionInvocationId)
+; CHECK-NEXT: call void @__wasm_setjmp(ptr %buf, i32 1, ptr %functionInvocationId)
; CHECK-NEXT: br label %entry.split.split
; CHECK: entry.split.split:
; CHECK-NEXT: phi i32 [ 0, %entry.split ], [ %[[LONGJMP_RESULT:.*]], %if.end ]
-; CHECK-NEXT: %[[JMPBUF:.*]] = ptrtoint ptr %[[BUF]] to [[PTR]]
+; CHECK-NEXT: %[[JMPBUF:.*]] = ptrtoint ptr %buf to [[PTR]]
; CHECK-NEXT: store [[PTR]] 0, ptr @__THREW__
; CHECK-NEXT: call cc{{.*}} void @__invoke_void_[[PTR]]_i32(ptr @emscripten_longjmp, [[PTR]] %[[JMPBUF]], i32 1)
; CHECK-NEXT: %[[__THREW__VAL:.*]] = load [[PTR]], ptr @__THREW__
diff --git a/llvm/test/CodeGen/WebAssembly/lower-wasm-ehsjlj.ll b/llvm/test/CodeGen/WebAssembly/lower-wasm-ehsjlj.ll
index b4c93c4..9de6652 100644
--- a/llvm/test/CodeGen/WebAssembly/lower-wasm-ehsjlj.ll
+++ b/llvm/test/CodeGen/WebAssembly/lower-wasm-ehsjlj.ll
@@ -108,7 +108,7 @@ catch: ; preds = %catch.start
call void @__cxa_end_catch() [ "funclet"(token %2) ]
catchret from %2 to label %catchret.dest
; CHECK: catch: ; preds = %catch.start
-; CHECK-NEXT: %exn = load ptr, ptr %exn.slot6, align 4
+; CHECK-NEXT: %exn = load ptr, ptr %exn.slot, align 4
; CHECK-NEXT: %5 = call ptr @__cxa_begin_catch(ptr %exn) #3 [ "funclet"(token %2) ]
; CHECK-NEXT: invoke void @__cxa_end_catch() [ "funclet"(token %2) ]
; CHECK-NEXT: to label %.noexc unwind label %catch.dispatch.longjmp
diff --git a/llvm/test/CodeGen/WebAssembly/lower-wasm-sjlj.ll b/llvm/test/CodeGen/WebAssembly/lower-wasm-sjlj.ll
index 82c04e2..e1cb859 100644
--- a/llvm/test/CodeGen/WebAssembly/lower-wasm-sjlj.ll
+++ b/llvm/test/CodeGen/WebAssembly/lower-wasm-sjlj.ll
@@ -25,26 +25,24 @@ entry:
unreachable
; CHECK: entry:
+; CHECK-NEXT: %buf = alloca [1 x %struct.__jmp_buf_tag], align 16
; CHECK-NEXT: %functionInvocationId = alloca i32, align 4
; CHECK-NEXT: br label %setjmp.dispatch
; CHECK: setjmp.dispatch:
; CHECK-NEXT: %[[VAL2:.*]] = phi i32 [ %val, %if.end ], [ undef, %entry ]
-; CHECK-NEXT: %[[BUF:.*]] = phi ptr [ %[[BUF2:.*]], %if.end ], [ undef, %entry ]
; CHECK-NEXT: %label.phi = phi i32 [ %label, %if.end ], [ -1, %entry ]
; CHECK-NEXT: switch i32 %label.phi, label %entry.split [
; CHECK-NEXT: i32 1, label %entry.split.split
; CHECK-NEXT: ]
; CHECK: entry.split:
-; CHECK-NEXT: %buf = alloca [1 x %struct.__jmp_buf_tag], align 16
; CHECK-NEXT: call void @__wasm_setjmp(ptr %buf, i32 1, ptr %functionInvocationId)
; CHECK-NEXT: br label %entry.split.split
; CHECK: entry.split.split:
-; CHECK-NEXT: %[[BUF2]] = phi ptr [ %[[BUF]], %setjmp.dispatch ], [ %buf, %entry.split ]
; CHECK-NEXT: %setjmp.ret = phi i32 [ 0, %entry.split ], [ %[[VAL2]], %setjmp.dispatch ]
-; CHECK-NEXT: invoke void @__wasm_longjmp(ptr %[[BUF2]], i32 1)
+; CHECK-NEXT: invoke void @__wasm_longjmp(ptr %buf, i32 1)
; CHECK-NEXT: to label %.noexc unwind label %catch.dispatch.longjmp
; CHECK: .noexc:
diff --git a/llvm/test/CodeGen/WebAssembly/ref-test-func.ll b/llvm/test/CodeGen/WebAssembly/ref-test-func.ll
index e4014ba..ea2453f 100644
--- a/llvm/test/CodeGen/WebAssembly/ref-test-func.ll
+++ b/llvm/test/CodeGen/WebAssembly/ref-test-func.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s --mtriple=wasm32-unknown-unknown -mcpu=mvp -mattr=+reference-types -verify-machineinstrs | FileCheck --check-prefixes CHECK,CHK32 %s
-; RUN: llc < %s --mtriple=wasm64-unknown-unknown -mcpu=mvp -mattr=+reference-types -verify-machineinstrs | FileCheck --check-prefixes CHECK,CHK64 %s
+; RUN: llc < %s --mtriple=wasm32-unknown-unknown -mcpu=mvp -mattr=+reference-types -mattr=+gc -verify-machineinstrs | FileCheck --check-prefixes CHECK,CHK32 %s
+; RUN: llc < %s --mtriple=wasm64-unknown-unknown -mcpu=mvp -mattr=+reference-types -mattr=+gc -verify-machineinstrs | FileCheck --check-prefixes CHECK,CHK64 %s
define void @test_fpsig_void_void(ptr noundef %func) local_unnamed_addr #0 {
; CHECK-LABEL: test_fpsig_void_void:
diff --git a/llvm/test/CodeGen/WebAssembly/returned.ll b/llvm/test/CodeGen/WebAssembly/returned.ll
index e767e29..aef75d8 100644
--- a/llvm/test/CodeGen/WebAssembly/returned.ll
+++ b/llvm/test/CodeGen/WebAssembly/returned.ll
@@ -80,3 +80,27 @@ define i32 @test_second_arg(i32 %a, i32 %b) {
%call = call i32 @do_something_else(i32 %a, i32 %b)
ret i32 %b
}
+
+define void @test() {
+; CHECK-LABEL: test:
+; CHECK: .functype test () -> ()
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: global.get $push0=, __stack_pointer
+; CHECK-NEXT: i32.const $push1=, 16
+; CHECK-NEXT: i32.sub $push7=, $pop0, $pop1
+; CHECK-NEXT: local.tee $push6=, $0=, $pop7
+; CHECK-NEXT: global.set __stack_pointer, $pop6
+; CHECK-NEXT: i32.const $push4=, 12
+; CHECK-NEXT: i32.add $push5=, $0, $pop4
+; CHECK-NEXT: call $drop=, returns_arg, $pop5
+; CHECK-NEXT: i32.const $push2=, 16
+; CHECK-NEXT: i32.add $push3=, $0, $pop2
+; CHECK-NEXT: global.set __stack_pointer, $pop3
+; CHECK-NEXT: return
+entry:
+ %a = alloca i32
+ call void @llvm.lifetime.start.p0(i64 4, ptr %a)
+ %ret = call ptr @returns_arg(ptr %a)
+ call void @llvm.lifetime.end.p0(i64 4, ptr %a)
+ ret void
+}
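+; The lifetime markers around %a should not confuse lowering of the returned
+; pointer: the frame slot is passed to returns_arg and the returned value is
+; simply dropped.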
diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
index e3607e1..36637e1 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
@@ -199,139 +199,17 @@ define <16 x i8> @mul_v16i8(<16 x i8> %x, <16 x i8> %y) {
; SIMD128-LABEL: mul_v16i8:
; SIMD128: .functype mul_v16i8 (v128, v128) -> (v128)
; SIMD128-NEXT: # %bb.0:
-; SIMD128-NEXT: i8x16.extract_lane_u $push4=, $0, 0
-; SIMD128-NEXT: i8x16.extract_lane_u $push3=, $1, 0
-; SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3
-; SIMD128-NEXT: i8x16.splat $push6=, $pop5
-; SIMD128-NEXT: i8x16.extract_lane_u $push1=, $0, 1
-; SIMD128-NEXT: i8x16.extract_lane_u $push0=, $1, 1
-; SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0
-; SIMD128-NEXT: i8x16.replace_lane $push7=, $pop6, 1, $pop2
-; SIMD128-NEXT: i8x16.extract_lane_u $push9=, $0, 2
-; SIMD128-NEXT: i8x16.extract_lane_u $push8=, $1, 2
-; SIMD128-NEXT: i32.mul $push10=, $pop9, $pop8
-; SIMD128-NEXT: i8x16.replace_lane $push11=, $pop7, 2, $pop10
-; SIMD128-NEXT: i8x16.extract_lane_u $push13=, $0, 3
-; SIMD128-NEXT: i8x16.extract_lane_u $push12=, $1, 3
-; SIMD128-NEXT: i32.mul $push14=, $pop13, $pop12
-; SIMD128-NEXT: i8x16.replace_lane $push15=, $pop11, 3, $pop14
-; SIMD128-NEXT: i8x16.extract_lane_u $push17=, $0, 4
-; SIMD128-NEXT: i8x16.extract_lane_u $push16=, $1, 4
-; SIMD128-NEXT: i32.mul $push18=, $pop17, $pop16
-; SIMD128-NEXT: i8x16.replace_lane $push19=, $pop15, 4, $pop18
-; SIMD128-NEXT: i8x16.extract_lane_u $push21=, $0, 5
-; SIMD128-NEXT: i8x16.extract_lane_u $push20=, $1, 5
-; SIMD128-NEXT: i32.mul $push22=, $pop21, $pop20
-; SIMD128-NEXT: i8x16.replace_lane $push23=, $pop19, 5, $pop22
-; SIMD128-NEXT: i8x16.extract_lane_u $push25=, $0, 6
-; SIMD128-NEXT: i8x16.extract_lane_u $push24=, $1, 6
-; SIMD128-NEXT: i32.mul $push26=, $pop25, $pop24
-; SIMD128-NEXT: i8x16.replace_lane $push27=, $pop23, 6, $pop26
-; SIMD128-NEXT: i8x16.extract_lane_u $push29=, $0, 7
-; SIMD128-NEXT: i8x16.extract_lane_u $push28=, $1, 7
-; SIMD128-NEXT: i32.mul $push30=, $pop29, $pop28
-; SIMD128-NEXT: i8x16.replace_lane $push31=, $pop27, 7, $pop30
-; SIMD128-NEXT: i8x16.extract_lane_u $push33=, $0, 8
-; SIMD128-NEXT: i8x16.extract_lane_u $push32=, $1, 8
-; SIMD128-NEXT: i32.mul $push34=, $pop33, $pop32
-; SIMD128-NEXT: i8x16.replace_lane $push35=, $pop31, 8, $pop34
-; SIMD128-NEXT: i8x16.extract_lane_u $push37=, $0, 9
-; SIMD128-NEXT: i8x16.extract_lane_u $push36=, $1, 9
-; SIMD128-NEXT: i32.mul $push38=, $pop37, $pop36
-; SIMD128-NEXT: i8x16.replace_lane $push39=, $pop35, 9, $pop38
-; SIMD128-NEXT: i8x16.extract_lane_u $push41=, $0, 10
-; SIMD128-NEXT: i8x16.extract_lane_u $push40=, $1, 10
-; SIMD128-NEXT: i32.mul $push42=, $pop41, $pop40
-; SIMD128-NEXT: i8x16.replace_lane $push43=, $pop39, 10, $pop42
-; SIMD128-NEXT: i8x16.extract_lane_u $push45=, $0, 11
-; SIMD128-NEXT: i8x16.extract_lane_u $push44=, $1, 11
-; SIMD128-NEXT: i32.mul $push46=, $pop45, $pop44
-; SIMD128-NEXT: i8x16.replace_lane $push47=, $pop43, 11, $pop46
-; SIMD128-NEXT: i8x16.extract_lane_u $push49=, $0, 12
-; SIMD128-NEXT: i8x16.extract_lane_u $push48=, $1, 12
-; SIMD128-NEXT: i32.mul $push50=, $pop49, $pop48
-; SIMD128-NEXT: i8x16.replace_lane $push51=, $pop47, 12, $pop50
-; SIMD128-NEXT: i8x16.extract_lane_u $push53=, $0, 13
-; SIMD128-NEXT: i8x16.extract_lane_u $push52=, $1, 13
-; SIMD128-NEXT: i32.mul $push54=, $pop53, $pop52
-; SIMD128-NEXT: i8x16.replace_lane $push55=, $pop51, 13, $pop54
-; SIMD128-NEXT: i8x16.extract_lane_u $push57=, $0, 14
-; SIMD128-NEXT: i8x16.extract_lane_u $push56=, $1, 14
-; SIMD128-NEXT: i32.mul $push58=, $pop57, $pop56
-; SIMD128-NEXT: i8x16.replace_lane $push59=, $pop55, 14, $pop58
-; SIMD128-NEXT: i8x16.extract_lane_u $push61=, $0, 15
-; SIMD128-NEXT: i8x16.extract_lane_u $push60=, $1, 15
-; SIMD128-NEXT: i32.mul $push62=, $pop61, $pop60
-; SIMD128-NEXT: i8x16.replace_lane $push63=, $pop59, 15, $pop62
-; SIMD128-NEXT: return $pop63
+; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push1=, $0, $1
+; SIMD128-NEXT: i16x8.extmul_high_i8x16_u $push0=, $0, $1
+; SIMD128-NEXT: i8x16.shuffle $push2=, $pop1, $pop0, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; SIMD128-NEXT: return $pop2
;
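+; There is no i8x16.mul instruction in WebAssembly SIMD; the expected lowering
+; widens both operands with i16x8.extmul_low/high and shuffles the even (low)
+; bytes of the products back into a single v128.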
; SIMD128-FAST-LABEL: mul_v16i8:
; SIMD128-FAST: .functype mul_v16i8 (v128, v128) -> (v128)
; SIMD128-FAST-NEXT: # %bb.0:
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push5=, $0, 0
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push4=, $1, 0
-; SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
-; SIMD128-FAST-NEXT: i8x16.splat $push7=, $pop6
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push2=, $0, 1
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push1=, $1, 1
-; SIMD128-FAST-NEXT: i32.mul $push3=, $pop2, $pop1
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push8=, $pop7, 1, $pop3
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push10=, $0, 2
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push9=, $1, 2
-; SIMD128-FAST-NEXT: i32.mul $push11=, $pop10, $pop9
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push12=, $pop8, 2, $pop11
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push14=, $0, 3
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push13=, $1, 3
-; SIMD128-FAST-NEXT: i32.mul $push15=, $pop14, $pop13
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push16=, $pop12, 3, $pop15
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push18=, $0, 4
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push17=, $1, 4
-; SIMD128-FAST-NEXT: i32.mul $push19=, $pop18, $pop17
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push20=, $pop16, 4, $pop19
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push22=, $0, 5
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push21=, $1, 5
-; SIMD128-FAST-NEXT: i32.mul $push23=, $pop22, $pop21
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push24=, $pop20, 5, $pop23
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push26=, $0, 6
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push25=, $1, 6
-; SIMD128-FAST-NEXT: i32.mul $push27=, $pop26, $pop25
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push28=, $pop24, 6, $pop27
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push30=, $0, 7
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push29=, $1, 7
-; SIMD128-FAST-NEXT: i32.mul $push31=, $pop30, $pop29
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push32=, $pop28, 7, $pop31
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push34=, $0, 8
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push33=, $1, 8
-; SIMD128-FAST-NEXT: i32.mul $push35=, $pop34, $pop33
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push36=, $pop32, 8, $pop35
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push38=, $0, 9
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push37=, $1, 9
-; SIMD128-FAST-NEXT: i32.mul $push39=, $pop38, $pop37
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push40=, $pop36, 9, $pop39
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push42=, $0, 10
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push41=, $1, 10
-; SIMD128-FAST-NEXT: i32.mul $push43=, $pop42, $pop41
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push44=, $pop40, 10, $pop43
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push46=, $0, 11
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push45=, $1, 11
-; SIMD128-FAST-NEXT: i32.mul $push47=, $pop46, $pop45
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push48=, $pop44, 11, $pop47
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push50=, $0, 12
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push49=, $1, 12
-; SIMD128-FAST-NEXT: i32.mul $push51=, $pop50, $pop49
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push52=, $pop48, 12, $pop51
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push54=, $0, 13
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push53=, $1, 13
-; SIMD128-FAST-NEXT: i32.mul $push55=, $pop54, $pop53
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push56=, $pop52, 13, $pop55
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push58=, $0, 14
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push57=, $1, 14
-; SIMD128-FAST-NEXT: i32.mul $push59=, $pop58, $pop57
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push60=, $pop56, 14, $pop59
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push62=, $0, 15
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push61=, $1, 15
-; SIMD128-FAST-NEXT: i32.mul $push63=, $pop62, $pop61
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push0=, $pop60, 15, $pop63
+; SIMD128-FAST-NEXT: i16x8.extmul_low_i8x16_u $push2=, $0, $1
+; SIMD128-FAST-NEXT: i16x8.extmul_high_i8x16_u $push1=, $0, $1
+; SIMD128-FAST-NEXT: i8x16.shuffle $push0=, $pop2, $pop1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
; SIMD128-FAST-NEXT: return $pop0
;
; NO-SIMD128-LABEL: mul_v16i8:
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll
new file mode 100644
index 0000000..6e2d860
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128,+relaxed-simd | FileCheck %s --check-prefix=RELAXED
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128 | FileCheck %s --check-prefix=STRICT
+
+target triple = "wasm32"
+
+define double @fsub_fmul_contract_f64(double %a, double %b, double %c) {
+; RELAXED-LABEL: fsub_fmul_contract_f64:
+; RELAXED: .functype fsub_fmul_contract_f64 (f64, f64, f64) -> (f64)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f64.mul $push0=, $1, $0
+; RELAXED-NEXT: f64.sub $push1=, $2, $pop0
+; RELAXED-NEXT: return $pop1
+;
+; STRICT-LABEL: fsub_fmul_contract_f64:
+; STRICT: .functype fsub_fmul_contract_f64 (f64, f64, f64) -> (f64)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f64.mul $push0=, $1, $0
+; STRICT-NEXT: f64.sub $push1=, $2, $pop0
+; STRICT-NEXT: return $pop1
+ %mul = fmul contract double %b, %a
+ %sub = fsub contract double %c, %mul
+ ret double %sub
+}
+
+define <4 x float> @fsub_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; RELAXED-LABEL: fsub_fmul_contract_4xf32:
+; RELAXED: .functype fsub_fmul_contract_4xf32 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $2, $1, $0
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fsub_fmul_contract_4xf32:
+; STRICT: .functype fsub_fmul_contract_4xf32 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f32x4.mul $push0=, $1, $0
+; STRICT-NEXT: f32x4.sub $push1=, $2, $pop0
+; STRICT-NEXT: return $pop1
+ %mul = fmul contract <4 x float> %b, %a
+ %sub = fsub contract <4 x float> %c, %mul
+ ret <4 x float> %sub
+}
+
+
+define <8 x half> @fsub_fmul_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; RELAXED-LABEL: fsub_fmul_contract_8xf16:
+; RELAXED: .functype fsub_fmul_contract_8xf16 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f16x8.relaxed_nmadd $push0=, $2, $1, $0
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fsub_fmul_contract_8xf16:
+; STRICT: .functype fsub_fmul_contract_8xf16 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f16x8.mul $push0=, $1, $0
+; STRICT-NEXT: f16x8.sub $push1=, $2, $pop0
+; STRICT-NEXT: return $pop1
+ %mul = fmul contract <8 x half> %b, %a
+ %sub = fsub contract <8 x half> %c, %mul
+ ret <8 x half> %sub
+}
+
+
+define <4 x float> @fsub_fmul_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; RELAXED-LABEL: fsub_fmul_4xf32:
+; RELAXED: .functype fsub_fmul_4xf32 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f32x4.mul $push0=, $1, $0
+; RELAXED-NEXT: f32x4.sub $push1=, $2, $pop0
+; RELAXED-NEXT: return $pop1
+;
+; STRICT-LABEL: fsub_fmul_4xf32:
+; STRICT: .functype fsub_fmul_4xf32 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f32x4.mul $push0=, $1, $0
+; STRICT-NEXT: f32x4.sub $push1=, $2, $pop0
+; STRICT-NEXT: return $pop1
+ %mul = fmul <4 x float> %b, %a
+ %sub = fsub contract <4 x float> %c, %mul
+ ret <4 x float> %sub
+}
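+; Here only the fsub carries the contract flag, so fusing into relaxed_nmadd
+; would not be sound; both RELAXED and STRICT should keep the separate mul/sub.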
+
+define <8 x float> @fsub_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
+; RELAXED-LABEL: fsub_fmul_contract_8xf32:
+; RELAXED: .functype fsub_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $6, $4, $2
+; RELAXED-NEXT: v128.store 16($0), $pop0
+; RELAXED-NEXT: f32x4.relaxed_nmadd $push1=, $5, $3, $1
+; RELAXED-NEXT: v128.store 0($0), $pop1
+; RELAXED-NEXT: return
+;
+; STRICT-LABEL: fsub_fmul_contract_8xf32:
+; STRICT: .functype fsub_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f32x4.mul $push0=, $4, $2
+; STRICT-NEXT: f32x4.sub $push1=, $6, $pop0
+; STRICT-NEXT: v128.store 16($0), $pop1
+; STRICT-NEXT: f32x4.mul $push2=, $3, $1
+; STRICT-NEXT: f32x4.sub $push3=, $5, $pop2
+; STRICT-NEXT: v128.store 0($0), $pop3
+; STRICT-NEXT: return
+ %mul = fmul contract <8 x float> %b, %a
+ %sub = fsub contract <8 x float> %c, %mul
+ ret <8 x float> %sub
+}
+
+
+define <2 x double> @fsub_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fsub_fmul_contract_2xf64:
+; RELAXED: .functype fsub_fmul_contract_2xf64 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f64x2.relaxed_nmadd $push0=, $2, $1, $0
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fsub_fmul_contract_2xf64:
+; STRICT: .functype fsub_fmul_contract_2xf64 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f64x2.mul $push0=, $1, $0
+; STRICT-NEXT: f64x2.sub $push1=, $2, $pop0
+; STRICT-NEXT: return $pop1
+ %mul = fmul contract <2 x double> %b, %a
+ %sub = fsub contract <2 x double> %c, %mul
+ ret <2 x double> %sub
+}
+
+define float @fsub_fmul_contract_f32(float %a, float %b, float %c) {
+; RELAXED-LABEL: fsub_fmul_contract_f32:
+; RELAXED: .functype fsub_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f32.mul $push0=, $1, $0
+; RELAXED-NEXT: f32.sub $push1=, $2, $pop0
+; RELAXED-NEXT: return $pop1
+;
+; STRICT-LABEL: fsub_fmul_contract_f32:
+; STRICT: .functype fsub_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f32.mul $push0=, $1, $0
+; STRICT-NEXT: f32.sub $push1=, $2, $pop0
+; STRICT-NEXT: return $pop1
+ %mul = fmul contract float %b, %a
+ %sub = fsub contract float %c, %mul
+ ret float %sub
+}
+
diff --git a/llvm/test/CodeGen/WebAssembly/target-features-cpus.ll b/llvm/test/CodeGen/WebAssembly/target-features-cpus.ll
index 1c77ad5..60cfc27 100644
--- a/llvm/test/CodeGen/WebAssembly/target-features-cpus.ll
+++ b/llvm/test/CodeGen/WebAssembly/target-features-cpus.ll
@@ -70,9 +70,9 @@ target triple = "wasm32-unknown-unknown"
; +call-indirect-overlong, +exception-handling,
; +extended-const, +fp16, +multimemory, +multivalue,
; +mutable-globals, +nontrapping-fptoint, +relaxed-simd,
-; +reference-types, +simd128, +sign-ext, +tail-call
+; +reference-types, +simd128, +sign-ext, +tail-call, +gc
; BLEEDING-EDGE-LABEL: .section .custom_section.target_features,"",@
-; BLEEDING-EDGE-NEXT: .int8 16
+; BLEEDING-EDGE-NEXT: .int8 17
; BLEEDING-EDGE-NEXT: .int8 43
; BLEEDING-EDGE-NEXT: .int8 7
; BLEEDING-EDGE-NEXT: .ascii "atomics"
@@ -95,6 +95,9 @@ target triple = "wasm32-unknown-unknown"
; BLEEDING-EDGE-NEXT: .int8 4
; BLEEDING-EDGE-NEXT: .ascii "fp16"
; BLEEDING-EDGE-NEXT: .int8 43
+; BLEEDING-EDGE-NEXT: .int8 2
+; BLEEDING-EDGE-NEXT: .ascii "gc"
+; BLEEDING-EDGE-NEXT: .int8 43
; BLEEDING-EDGE-NEXT: .int8 11
; BLEEDING-EDGE-NEXT: .ascii "multimemory"
; BLEEDING-EDGE-NEXT: .int8 43
diff --git a/llvm/test/CodeGen/WebAssembly/vector-reduce.ll b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll
index 1d194b6..4c30a3a 100644
--- a/llvm/test/CodeGen/WebAssembly/vector-reduce.ll
+++ b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll
@@ -116,40 +116,28 @@ define i8 @pairwise_mul_v16i8(<16 x i8> %arg) {
; SIMD128-LABEL: pairwise_mul_v16i8:
; SIMD128: .functype pairwise_mul_v16i8 (v128) -> (i32)
; SIMD128-NEXT: # %bb.0:
-; SIMD128-NEXT: i8x16.extract_lane_u $push26=, $0, 0
-; SIMD128-NEXT: i8x16.shuffle $push32=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
-; SIMD128-NEXT: local.tee $push31=, $1=, $pop32
-; SIMD128-NEXT: i8x16.extract_lane_u $push25=, $pop31, 0
-; SIMD128-NEXT: i32.mul $push27=, $pop26, $pop25
-; SIMD128-NEXT: i8x16.extract_lane_u $push23=, $0, 4
-; SIMD128-NEXT: i8x16.extract_lane_u $push22=, $1, 4
-; SIMD128-NEXT: i32.mul $push24=, $pop23, $pop22
-; SIMD128-NEXT: i32.mul $push28=, $pop27, $pop24
-; SIMD128-NEXT: i8x16.extract_lane_u $push19=, $0, 2
-; SIMD128-NEXT: i8x16.extract_lane_u $push18=, $1, 2
-; SIMD128-NEXT: i32.mul $push20=, $pop19, $pop18
-; SIMD128-NEXT: i8x16.extract_lane_u $push16=, $0, 6
-; SIMD128-NEXT: i8x16.extract_lane_u $push15=, $1, 6
-; SIMD128-NEXT: i32.mul $push17=, $pop16, $pop15
-; SIMD128-NEXT: i32.mul $push21=, $pop20, $pop17
-; SIMD128-NEXT: i32.mul $push29=, $pop28, $pop21
-; SIMD128-NEXT: i8x16.extract_lane_u $push11=, $0, 1
-; SIMD128-NEXT: i8x16.extract_lane_u $push10=, $1, 1
-; SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10
-; SIMD128-NEXT: i8x16.extract_lane_u $push8=, $0, 5
-; SIMD128-NEXT: i8x16.extract_lane_u $push7=, $1, 5
-; SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
-; SIMD128-NEXT: i32.mul $push13=, $pop12, $pop9
-; SIMD128-NEXT: i8x16.extract_lane_u $push4=, $0, 3
-; SIMD128-NEXT: i8x16.extract_lane_u $push3=, $1, 3
-; SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3
-; SIMD128-NEXT: i8x16.extract_lane_u $push1=, $0, 7
-; SIMD128-NEXT: i8x16.extract_lane_u $push0=, $1, 7
-; SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0
-; SIMD128-NEXT: i32.mul $push6=, $pop5, $pop2
-; SIMD128-NEXT: i32.mul $push14=, $pop13, $pop6
-; SIMD128-NEXT: i32.mul $push30=, $pop29, $pop14
-; SIMD128-NEXT: return $pop30
+; SIMD128-NEXT: i8x16.shuffle $push20=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: local.tee $push19=, $1=, $pop20
+; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push1=, $0, $pop19
+; SIMD128-NEXT: i16x8.extmul_high_i8x16_u $push0=, $0, $1
+; SIMD128-NEXT: i8x16.shuffle $push18=, $pop1, $pop0, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; SIMD128-NEXT: local.tee $push17=, $0=, $pop18
+; SIMD128-NEXT: i8x16.shuffle $push16=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: local.tee $push15=, $1=, $pop16
+; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push3=, $pop17, $pop15
+; SIMD128-NEXT: i16x8.extmul_high_i8x16_u $push2=, $0, $1
+; SIMD128-NEXT: i8x16.shuffle $push14=, $pop3, $pop2, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; SIMD128-NEXT: local.tee $push13=, $0=, $pop14
+; SIMD128-NEXT: i8x16.shuffle $push12=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: local.tee $push11=, $1=, $pop12
+; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push5=, $pop13, $pop11
+; SIMD128-NEXT: i16x8.extmul_high_i8x16_u $push4=, $0, $1
+; SIMD128-NEXT: i8x16.shuffle $push10=, $pop5, $pop4, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; SIMD128-NEXT: local.tee $push9=, $0=, $pop10
+; SIMD128-NEXT: i8x16.shuffle $push6=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push7=, $pop9, $pop6
+; SIMD128-NEXT: i8x16.extract_lane_u $push8=, $pop7, 0
+; SIMD128-NEXT: return $pop8
%res = tail call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %arg)
ret i8 %res
}
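+; The reduction should now halve the vector with a shuffle and multiply the
+; halves via extmul each round, instead of extracting and multiplying all
+; sixteen lanes scalar-wise.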
diff --git a/llvm/test/CodeGen/X86/embed-bitcode.ll b/llvm/test/CodeGen/X86/embed-bitcode.ll
index 0d66ba8..d4af954 100644
--- a/llvm/test/CodeGen/X86/embed-bitcode.ll
+++ b/llvm/test/CodeGen/X86/embed-bitcode.ll
@@ -1,10 +1,23 @@
; RUN: llc -filetype=obj -mtriple=x86_64 %s -o %t
; RUN: llvm-readelf -S %t | FileCheck %s
+; RUN: llc -filetype=obj -mtriple=x86_64-pc-windows-msvc %s -o %t
+; RUN: llvm-readobj -S %t | FileCheck %s --check-prefix=COFF
; CHECK: .text PROGBITS 0000000000000000 [[#%x,OFF:]] 000000 00 AX 0
; CHECK-NEXT: .llvmbc PROGBITS 0000000000000000 [[#%x,OFF:]] 000004 00 0
; CHECK-NEXT: .llvmcmd PROGBITS 0000000000000000 [[#%x,OFF:]] 000005 00 0
+; COFF: Name: .llvmbc (2E 6C 6C 76 6D 62 63 00)
+; COFF: Characteristics [
+; COFF-NEXT: IMAGE_SCN_ALIGN_1BYTES
+; COFF-NEXT: IMAGE_SCN_MEM_DISCARDABLE
+; COFF-NEXT: ]
+; COFF: Name: .llvmcmd (2E 6C 6C 76 6D 63 6D 64)
+; COFF: Characteristics [
+; COFF-NEXT: IMAGE_SCN_ALIGN_1BYTES
+; COFF-NEXT: IMAGE_SCN_MEM_DISCARDABLE
+; COFF-NEXT: ]
+
@llvm.embedded.module = private constant [4 x i8] c"BC\C0\DE", section ".llvmbc", align 1
@llvm.cmdline = private constant [5 x i8] c"-cc1\00", section ".llvmcmd", align 1
@llvm.compiler.used = appending global [2 x ptr] [ptr @llvm.embedded.module, ptr @llvm.cmdline], section "llvm.metadata"
diff --git a/llvm/test/CodeGen/X86/isel-fpclass.ll b/llvm/test/CodeGen/X86/isel-fpclass.ll
new file mode 100644
index 0000000..960bbf5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/isel-fpclass.ll
@@ -0,0 +1,526 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=i686-linux | FileCheck %s -check-prefixes=X86-SDAGISEL
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefixes=X64,X64-SDAGISEL
+; RUN: llc < %s -mtriple=i686-linux -fast-isel -fast-isel-abort=1 | FileCheck %s -check-prefixes=X86-FASTISEL
+; RUN: llc < %s -mtriple=x86_64-linux -fast-isel -fast-isel-abort=1 | FileCheck %s -check-prefixes=X64,X64-FASTISEL
+
+; FIXME: We can reuse or delete llvm/test/CodeGen/X86/is_fpclass.ll once all patches are in.
+
+define i1 @isnone_f(float %x) {
+; X86-SDAGISEL-LABEL: isnone_f:
+; X86-SDAGISEL: # %bb.0: # %entry
+; X86-SDAGISEL-NEXT: xorl %eax, %eax
+; X86-SDAGISEL-NEXT: retl
+;
+; X64-LABEL: isnone_f:
+; X64: # %bb.0: # %entry
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+;
+; X86-FASTISEL-LABEL: isnone_f:
+; X86-FASTISEL: # %bb.0: # %entry
+; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp)
+; X86-FASTISEL-NEXT: fstp %st(0)
+; X86-FASTISEL-NEXT: xorl %eax, %eax
+; X86-FASTISEL-NEXT: retl
+entry:
+ %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 0)
+ ret i1 %0
+}
+
+define i1 @isany_f(float %x) {
+; X86-SDAGISEL-LABEL: isany_f:
+; X86-SDAGISEL: # %bb.0: # %entry
+; X86-SDAGISEL-NEXT: movb $1, %al
+; X86-SDAGISEL-NEXT: retl
+;
+; X64-LABEL: isany_f:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movb $1, %al
+; X64-NEXT: retq
+;
+; X86-FASTISEL-LABEL: isany_f:
+; X86-FASTISEL: # %bb.0: # %entry
+; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp)
+; X86-FASTISEL-NEXT: fstp %st(0)
+; X86-FASTISEL-NEXT: movb $1, %al
+; X86-FASTISEL-NEXT: retl
+entry:
+ %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1023)
+ ret i1 %0
+}
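+; Test masks 0 ("none") and 1023 ("all") should fold to constant false and
+; true respectively, without inspecting the input at all.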
+
+define i1 @issignaling_f(float %x) {
+; X86-SDAGISEL-LABEL: issignaling_f:
+; X86-SDAGISEL: # %bb.0:
+; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-SDAGISEL-NEXT: setl %cl
+; X86-SDAGISEL-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-SDAGISEL-NEXT: setge %al
+; X86-SDAGISEL-NEXT: andb %cl, %al
+; X86-SDAGISEL-NEXT: retl
+;
+; X64-LABEL: issignaling_f:
+; X64: # %bb.0:
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-NEXT: setl %cl
+; X64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001
+; X64-NEXT: setge %al
+; X64-NEXT: andb %cl, %al
+; X64-NEXT: retq
+;
+; X86-FASTISEL-LABEL: issignaling_f:
+; X86-FASTISEL: # %bb.0:
+; X86-FASTISEL-NEXT: pushl %eax
+; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8
+; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp)
+; X86-FASTISEL-NEXT: fstps (%esp)
+; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-FASTISEL-NEXT: andl (%esp), %eax
+; X86-FASTISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-FASTISEL-NEXT: setl %cl
+; X86-FASTISEL-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-FASTISEL-NEXT: setge %al
+; X86-FASTISEL-NEXT: andb %cl, %al
+; X86-FASTISEL-NEXT: popl %ecx
+; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4
+; X86-FASTISEL-NEXT: retl
+ %a0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1) ; "snan"
+ ret i1 %a0
+}
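+; A signaling NaN has its absolute bits in [0x7F800001, 0x7FC00000): above the
+; infinity pattern but with the quiet bit (0x00400000) clear, hence the pair
+; of compares above.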
+
+define i1 @isquiet_f(float %x) {
+; X86-SDAGISEL-LABEL: isquiet_f:
+; X86-SDAGISEL: # %bb.0: # %entry
+; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-SDAGISEL-NEXT: setge %al
+; X86-SDAGISEL-NEXT: retl
+;
+; X64-LABEL: isquiet_f:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-NEXT: setge %al
+; X64-NEXT: retq
+;
+; X86-FASTISEL-LABEL: isquiet_f:
+; X86-FASTISEL: # %bb.0: # %entry
+; X86-FASTISEL-NEXT: pushl %eax
+; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8
+; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp)
+; X86-FASTISEL-NEXT: fstps (%esp)
+; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-FASTISEL-NEXT: andl (%esp), %eax
+; X86-FASTISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-FASTISEL-NEXT: setge %al
+; X86-FASTISEL-NEXT: popl %ecx
+; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4
+; X86-FASTISEL-NEXT: retl
+entry:
+ %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 2) ; "qnan"
+ ret i1 %0
+}
+
+define i1 @not_isquiet_f(float %x) {
+; X86-SDAGISEL-LABEL: not_isquiet_f:
+; X86-SDAGISEL: # %bb.0: # %entry
+; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-SDAGISEL-NEXT: setl %al
+; X86-SDAGISEL-NEXT: retl
+;
+; X64-LABEL: not_isquiet_f:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-NEXT: setl %al
+; X64-NEXT: retq
+;
+; X86-FASTISEL-LABEL: not_isquiet_f:
+; X86-FASTISEL: # %bb.0: # %entry
+; X86-FASTISEL-NEXT: pushl %eax
+; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8
+; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp)
+; X86-FASTISEL-NEXT: fstps (%esp)
+; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-FASTISEL-NEXT: andl (%esp), %eax
+; X86-FASTISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-FASTISEL-NEXT: setl %al
+; X86-FASTISEL-NEXT: popl %ecx
+; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4
+; X86-FASTISEL-NEXT: retl
+entry:
+ %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1021) ; ~"qnan"
+ ret i1 %0
+}
+
+define i1 @isinf_f(float %x) {
+; X86-SDAGISEL-LABEL: isinf_f:
+; X86-SDAGISEL: # %bb.0: # %entry
+; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-SDAGISEL-NEXT: sete %al
+; X86-SDAGISEL-NEXT: retl
+;
+; X64-LABEL: isinf_f:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT: sete %al
+; X64-NEXT: retq
+;
+; X86-FASTISEL-LABEL: isinf_f:
+; X86-FASTISEL: # %bb.0: # %entry
+; X86-FASTISEL-NEXT: pushl %eax
+; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8
+; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp)
+; X86-FASTISEL-NEXT: fstps (%esp)
+; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-FASTISEL-NEXT: andl (%esp), %eax
+; X86-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-FASTISEL-NEXT: sete %al
+; X86-FASTISEL-NEXT: popl %ecx
+; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4
+; X86-FASTISEL-NEXT: retl
+entry:
+ %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 516) ; 0x204 = "inf"
+ ret i1 %0
+}
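+; Masking with 0x7FFFFFFF clears the sign bit; an absolute value of exactly
+; 0x7F800000 (all exponent bits set, zero mantissa) is +/-infinity.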
+
+define i1 @not_isinf_f(float %x) {
+; X86-SDAGISEL-LABEL: not_isinf_f:
+; X86-SDAGISEL: # %bb.0: # %entry
+; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-SDAGISEL-NEXT: setne %al
+; X86-SDAGISEL-NEXT: retl
+;
+; X64-LABEL: not_isinf_f:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT: setne %al
+; X64-NEXT: retq
+;
+; X86-FASTISEL-LABEL: not_isinf_f:
+; X86-FASTISEL: # %bb.0: # %entry
+; X86-FASTISEL-NEXT: pushl %eax
+; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8
+; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp)
+; X86-FASTISEL-NEXT: fstps (%esp)
+; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-FASTISEL-NEXT: andl (%esp), %eax
+; X86-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-FASTISEL-NEXT: setne %al
+; X86-FASTISEL-NEXT: popl %ecx
+; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4
+; X86-FASTISEL-NEXT: retl
+entry:
+ %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 507) ; ~0x204 = "~inf"
+ ret i1 %0
+}
+
+define i1 @is_plus_inf_f(float %x) {
+; X86-SDAGISEL-LABEL: is_plus_inf_f:
+; X86-SDAGISEL: # %bb.0: # %entry
+; X86-SDAGISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
+; X86-SDAGISEL-NEXT: sete %al
+; X86-SDAGISEL-NEXT: retl
+;
+; X64-LABEL: is_plus_inf_f:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT: sete %al
+; X64-NEXT: retq
+;
+; X86-FASTISEL-LABEL: is_plus_inf_f:
+; X86-FASTISEL: # %bb.0: # %entry
+; X86-FASTISEL-NEXT: pushl %eax
+; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8
+; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp)
+; X86-FASTISEL-NEXT: fstps (%esp)
+; X86-FASTISEL-NEXT: cmpl $2139095040, (%esp) # imm = 0x7F800000
+; X86-FASTISEL-NEXT: sete %al
+; X86-FASTISEL-NEXT: popl %ecx
+; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4
+; X86-FASTISEL-NEXT: retl
+entry:
+ %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 512) ; 0x200 = "+inf"
+ ret i1 %0
+}
+
+define i1 @is_minus_inf_f(float %x) {
+; X86-SDAGISEL-LABEL: is_minus_inf_f:
+; X86-SDAGISEL: # %bb.0: # %entry
+; X86-SDAGISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000
+; X86-SDAGISEL-NEXT: sete %al
+; X86-SDAGISEL-NEXT: retl
+;
+; X64-LABEL: is_minus_inf_f:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000
+; X64-NEXT: sete %al
+; X64-NEXT: retq
+;
+; X86-FASTISEL-LABEL: is_minus_inf_f:
+; X86-FASTISEL: # %bb.0: # %entry
+; X86-FASTISEL-NEXT: pushl %eax
+; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8
+; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp)
+; X86-FASTISEL-NEXT: fstps (%esp)
+; X86-FASTISEL-NEXT: cmpl $-8388608, (%esp) # imm = 0xFF800000
+; X86-FASTISEL-NEXT: sete %al
+; X86-FASTISEL-NEXT: popl %ecx
+; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4
+; X86-FASTISEL-NEXT: retl
+entry:
+ %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 4) ; "-inf"
+ ret i1 %0
+}
+
+define i1 @not_is_minus_inf_f(float %x) {
+; X86-SDAGISEL-LABEL: not_is_minus_inf_f:
+; X86-SDAGISEL: # %bb.0: # %entry
+; X86-SDAGISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000
+; X86-SDAGISEL-NEXT: setne %al
+; X86-SDAGISEL-NEXT: retl
+;
+; X64-LABEL: not_is_minus_inf_f:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000
+; X64-NEXT: setne %al
+; X64-NEXT: retq
+;
+; X86-FASTISEL-LABEL: not_is_minus_inf_f:
+; X86-FASTISEL: # %bb.0: # %entry
+; X86-FASTISEL-NEXT: pushl %eax
+; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8
+; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp)
+; X86-FASTISEL-NEXT: fstps (%esp)
+; X86-FASTISEL-NEXT: cmpl $-8388608, (%esp) # imm = 0xFF800000
+; X86-FASTISEL-NEXT: setne %al
+; X86-FASTISEL-NEXT: popl %ecx
+; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4
+; X86-FASTISEL-NEXT: retl
+entry:
+ %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1019) ; ~"-inf"
+ ret i1 %0
+}
+
+define i1 @isfinite_f(float %x) {
+; X86-SDAGISEL-LABEL: isfinite_f:
+; X86-SDAGISEL: # %bb.0: # %entry
+; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-SDAGISEL-NEXT: setl %al
+; X86-SDAGISEL-NEXT: retl
+;
+; X64-LABEL: isfinite_f:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT: setl %al
+; X64-NEXT: retq
+;
+; X86-FASTISEL-LABEL: isfinite_f:
+; X86-FASTISEL: # %bb.0: # %entry
+; X86-FASTISEL-NEXT: pushl %eax
+; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8
+; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp)
+; X86-FASTISEL-NEXT: fstps (%esp)
+; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-FASTISEL-NEXT: andl (%esp), %eax
+; X86-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-FASTISEL-NEXT: setl %al
+; X86-FASTISEL-NEXT: popl %ecx
+; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4
+; X86-FASTISEL-NEXT: retl
+entry:
+ %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 504) ; 0x1f8 = "finite"
+ ret i1 %0
+}
+
+define i1 @not_isfinite_f(float %x) {
+; X86-SDAGISEL-LABEL: not_isfinite_f:
+; X86-SDAGISEL: # %bb.0: # %entry
+; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-SDAGISEL-NEXT: setge %al
+; X86-SDAGISEL-NEXT: retl
+;
+; X64-LABEL: not_isfinite_f:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT: setge %al
+; X64-NEXT: retq
+;
+; X86-FASTISEL-LABEL: not_isfinite_f:
+; X86-FASTISEL: # %bb.0: # %entry
+; X86-FASTISEL-NEXT: pushl %eax
+; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8
+; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp)
+; X86-FASTISEL-NEXT: fstps (%esp)
+; X86-FASTISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-FASTISEL-NEXT: andl (%esp), %eax
+; X86-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-FASTISEL-NEXT: setge %al
+; X86-FASTISEL-NEXT: popl %ecx
+; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4
+; X86-FASTISEL-NEXT: retl
+entry:
+ %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 519) ; ~0x1f8 = "~finite"
+ ret i1 %0
+}
+
+define i1 @is_plus_finite_f(float %x) {
+; X86-SDAGISEL-LABEL: is_plus_finite_f:
+; X86-SDAGISEL: # %bb.0: # %entry
+; X86-SDAGISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
+; X86-SDAGISEL-NEXT: setb %al
+; X86-SDAGISEL-NEXT: retl
+;
+; X64-LABEL: is_plus_finite_f:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT: setb %al
+; X64-NEXT: retq
+;
+; X86-FASTISEL-LABEL: is_plus_finite_f:
+; X86-FASTISEL: # %bb.0: # %entry
+; X86-FASTISEL-NEXT: pushl %eax
+; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 8
+; X86-FASTISEL-NEXT: flds {{[0-9]+}}(%esp)
+; X86-FASTISEL-NEXT: fstps (%esp)
+; X86-FASTISEL-NEXT: cmpl $2139095040, (%esp) # imm = 0x7F800000
+; X86-FASTISEL-NEXT: setb %al
+; X86-FASTISEL-NEXT: popl %ecx
+; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4
+; X86-FASTISEL-NEXT: retl
+entry:
+ %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 448) ; 0x1c0 = "+finite"
+ ret i1 %0
+}
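+
+; An observation on the checks above: for "+finite" the sign bit is known
+; clear, so the abs mask is dropped and an unsigned compare (setb) suffices,
+; while "finite" first clears the sign bit and then uses a signed compare
+; (setl).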
+
+define i1 @isnone_d(double %x) nounwind {
+; X86-SDAGISEL-LABEL: isnone_d:
+; X86-SDAGISEL: # %bb.0: # %entry
+; X86-SDAGISEL-NEXT: xorl %eax, %eax
+; X86-SDAGISEL-NEXT: retl
+;
+; X64-LABEL: isnone_d:
+; X64: # %bb.0: # %entry
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+;
+; X86-FASTISEL-LABEL: isnone_d:
+; X86-FASTISEL: # %bb.0: # %entry
+; X86-FASTISEL-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-FASTISEL-NEXT: fstp %st(0)
+; X86-FASTISEL-NEXT: xorl %eax, %eax
+; X86-FASTISEL-NEXT: retl
+entry:
+ %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 0)
+ ret i1 %0
+}
+
+define i1 @isany_d(double %x) nounwind {
+; X86-SDAGISEL-LABEL: isany_d:
+; X86-SDAGISEL: # %bb.0: # %entry
+; X86-SDAGISEL-NEXT: movb $1, %al
+; X86-SDAGISEL-NEXT: retl
+;
+; X64-LABEL: isany_d:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movb $1, %al
+; X64-NEXT: retq
+;
+; X86-FASTISEL-LABEL: isany_d:
+; X86-FASTISEL: # %bb.0: # %entry
+; X86-FASTISEL-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-FASTISEL-NEXT: fstp %st(0)
+; X86-FASTISEL-NEXT: movb $1, %al
+; X86-FASTISEL-NEXT: retl
+entry:
+ %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 1023)
+ ret i1 %0
+}
+
+define i1 @isnone_f80(x86_fp80 %x) nounwind {
+; X86-SDAGISEL-LABEL: isnone_f80:
+; X86-SDAGISEL: # %bb.0: # %entry
+; X86-SDAGISEL-NEXT: xorl %eax, %eax
+; X86-SDAGISEL-NEXT: retl
+;
+; X64-SDAGISEL-LABEL: isnone_f80:
+; X64-SDAGISEL: # %bb.0: # %entry
+; X64-SDAGISEL-NEXT: xorl %eax, %eax
+; X64-SDAGISEL-NEXT: retq
+;
+; X86-FASTISEL-LABEL: isnone_f80:
+; X86-FASTISEL: # %bb.0: # %entry
+; X86-FASTISEL-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-FASTISEL-NEXT: fstp %st(0)
+; X86-FASTISEL-NEXT: xorl %eax, %eax
+; X86-FASTISEL-NEXT: retl
+;
+; X64-FASTISEL-LABEL: isnone_f80:
+; X64-FASTISEL: # %bb.0: # %entry
+; X64-FASTISEL-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-FASTISEL-NEXT: fstp %st(0)
+; X64-FASTISEL-NEXT: xorl %eax, %eax
+; X64-FASTISEL-NEXT: retq
+entry:
+ %0 = tail call i1 @llvm.is.fpclass.f80(x86_fp80 %x, i32 0)
+ ret i1 %0
+}
+
+define i1 @isany_f80(x86_fp80 %x) nounwind {
+; X86-SDAGISEL-LABEL: isany_f80:
+; X86-SDAGISEL: # %bb.0: # %entry
+; X86-SDAGISEL-NEXT: movb $1, %al
+; X86-SDAGISEL-NEXT: retl
+;
+; X64-SDAGISEL-LABEL: isany_f80:
+; X64-SDAGISEL: # %bb.0: # %entry
+; X64-SDAGISEL-NEXT: movb $1, %al
+; X64-SDAGISEL-NEXT: retq
+;
+; X86-FASTISEL-LABEL: isany_f80:
+; X86-FASTISEL: # %bb.0: # %entry
+; X86-FASTISEL-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-FASTISEL-NEXT: fstp %st(0)
+; X86-FASTISEL-NEXT: movb $1, %al
+; X86-FASTISEL-NEXT: retl
+;
+; X64-FASTISEL-LABEL: isany_f80:
+; X64-FASTISEL: # %bb.0: # %entry
+; X64-FASTISEL-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-FASTISEL-NEXT: fstp %st(0)
+; X64-FASTISEL-NEXT: movb $1, %al
+; X64-FASTISEL-NEXT: retq
+entry:
+ %0 = tail call i1 @llvm.is.fpclass.f80(x86_fp80 %x, i32 1023)
+ ret i1 %0
+}
diff --git a/llvm/test/CodeGen/X86/late-tail-dup-computed-goto.mir b/llvm/test/CodeGen/X86/late-tail-dup-computed-goto.mir
new file mode 100644
index 0000000..e272e7e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/late-tail-dup-computed-goto.mir
@@ -0,0 +1,128 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=tailduplication -tail-dup-pred-size=1 -tail-dup-succ-size=1 %s -o - | FileCheck %s
+#
+# Check that only the computed gotos are duplicated aggressively.
+#
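+# With -tail-dup-pred-size=1 and -tail-dup-succ-size=1 the generic limits
+# would otherwise block duplication, so the duplication checked below relies
+# on the looser treatment of blocks ending in an indirect jump through
+# @computed_goto.dispatch (a sketch of the intent, not a statement of the
+# exact heuristic).
+#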
+--- |
+ @computed_goto.dispatch = constant [5 x ptr] [ptr null, ptr blockaddress(@computed_goto, %bb1), ptr blockaddress(@computed_goto, %bb2), ptr blockaddress(@computed_goto, %bb3), ptr blockaddress(@computed_goto, %bb4)]
+ declare i64 @f0()
+ declare i64 @f1()
+ declare i64 @f2()
+ declare i64 @f3()
+ declare i64 @f4()
+ declare i64 @f5()
+ define void @computed_goto() {
+ start:
+ ret void
+ bb1:
+ ret void
+ bb2:
+ ret void
+ bb3:
+ ret void
+ bb4:
+ ret void
+ }
+ define void @jump_table() { ret void }
+ define void @jump_table_pic() { ret void }
+...
+---
+name: computed_goto
+alignment: 1
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: computed_goto
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rax
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]]
+ ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1.bb1 (ir-block-address-taken %ir-block.bb1):
+ ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f1, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY $rax
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY3]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]]
+ ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2.bb2 (ir-block-address-taken %ir-block.bb2):
+ ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f2, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr64 = COPY $rax
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY4]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]]
+ ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3.bb3 (ir-block-address-taken %ir-block.bb3):
+ ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f3, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr64 = COPY $rax
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY5]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]]
+ ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.4.bb4 (ir-block-address-taken %ir-block.bb4):
+ ; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f4, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY $rax
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY6]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]]
+ ; CHECK-NEXT: JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg
+ bb.0:
+ successors: %bb.5(0x80000000)
+
+ CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+ %0:gr64 = COPY $rax
+ %6:gr64_nosp = COPY %0
+ JMP_1 %bb.5
+
+ bb.1.bb1 (ir-block-address-taken %ir-block.bb1):
+ successors: %bb.5(0x80000000)
+
+ CALL64pcrel32 target-flags(x86-plt) @f1, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+ %1:gr64 = COPY $rax
+ %6:gr64_nosp = COPY %1
+ JMP_1 %bb.5
+
+ bb.2.bb2 (ir-block-address-taken %ir-block.bb2):
+ successors: %bb.5(0x80000000)
+
+ CALL64pcrel32 target-flags(x86-plt) @f2, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+ %2:gr64 = COPY $rax
+ %6:gr64_nosp = COPY %2
+ JMP_1 %bb.5
+
+ bb.3.bb3 (ir-block-address-taken %ir-block.bb3):
+ successors: %bb.5(0x80000000)
+
+ CALL64pcrel32 target-flags(x86-plt) @f3, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+ %3:gr64 = COPY $rax
+ %6:gr64_nosp = COPY %3
+ JMP_1 %bb.5
+
+ bb.4.bb4 (ir-block-address-taken %ir-block.bb4):
+ successors: %bb.5(0x80000000)
+
+ CALL64pcrel32 target-flags(x86-plt) @f4, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+ %4:gr64 = COPY $rax
+ %6:gr64_nosp = COPY %4
+
+ bb.5:
+ successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+
+ %5:gr64_nosp = COPY %6
+ JMP64m $noreg, 8, %5, @computed_goto.dispatch, $noreg
+...
diff --git a/llvm/test/CodeGen/X86/swap.ll b/llvm/test/CodeGen/X86/swap.ll
index 1dc454dd..3330403 100644
--- a/llvm/test/CodeGen/X86/swap.ll
+++ b/llvm/test/CodeGen/X86/swap.ll
@@ -113,21 +113,17 @@ define dso_local void @onealloc_readback_1(ptr nocapture %a, ptr nocapture %b) l
;
; AA-LABEL: onealloc_readback_1:
; AA: # %bb.0: # %entry
-; AA-NEXT: vmovups (%rdi), %xmm0
-; AA-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AA-NEXT: vmovups (%rsi), %xmm0
; AA-NEXT: vmovups %xmm0, (%rdi)
; AA-NEXT: retq
entry:
%alloc = alloca [16 x i8], i8 2, align 1
%part1 = getelementptr inbounds [16 x i8], ptr %alloc, i64 1, i64 0
- call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %part1)
- call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %alloc)
+ call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %alloc)
call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 1 %part1, ptr align 1 %a, i64 16, i1 false)
call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 1 %alloc, ptr align 1 %b, i64 16, i1 false)
- call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %part1)
tail call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 1 %a, ptr align 1 %alloc, i64 16, i1 false)
- call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %alloc)
+ call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %alloc)
ret void
}
@@ -144,19 +140,16 @@ define dso_local void @onealloc_readback_2(ptr nocapture %a, ptr nocapture %b) l
; AA-LABEL: onealloc_readback_2:
; AA: # %bb.0: # %entry
; AA-NEXT: vmovups (%rsi), %xmm0
-; AA-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AA-NEXT: vmovups %xmm0, (%rdi)
; AA-NEXT: retq
entry:
%alloc = alloca [16 x i8], i8 2, align 1
%part2 = getelementptr inbounds [16 x i8], ptr %alloc, i64 1, i64 0
- call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %alloc)
- call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %part2)
+ call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %alloc)
call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 1 %alloc, ptr align 1 %a, i64 16, i1 false)
call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 1 %part2, ptr align 1 %b, i64 16, i1 false)
- call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %alloc)
tail call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 1 %a, ptr align 1 %part2, i64 16, i1 false)
- call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %part2)
+ call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %alloc)
ret void
}
diff --git a/llvm/test/CodeGen/XCore/section-name.ll b/llvm/test/CodeGen/XCore/section-name.ll
index 0fa2cc6..b2176ec 100644
--- a/llvm/test/CodeGen/XCore/section-name.ll
+++ b/llvm/test/CodeGen/XCore/section-name.ll
@@ -1,4 +1,4 @@
-; RUN: not llc < %s -mtriple=xcore -o /dev/null 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=xcore | FileCheck %s
@bar = internal global i32 zeroinitializer
@@ -6,4 +6,4 @@ define void @".dp.bss"() {
ret void
}
-; CHECK: <unknown>:0: error: symbol '.dp.bss' is already defined
+; CHECK: .dp.bss:
diff --git a/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll b/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll
index 8a9052c..fa42481 100644
--- a/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll
+++ b/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll
@@ -6,16 +6,12 @@
; CHECK: .visible .func use_dbg_declare()
; CHECK: .local .align 8 .b8 __local_depot0[8];
; CHECK: mov.b64 %SPL, __local_depot0;
-; CHECK: add.u64 %rd1, %SP, 0;
; CHECK: .loc 1 5 3 // t.c:5:3
; CHECK: { // callseq 0, 0
; CHECK: .param .b64 param0;
+; CHECK: add.u64 %rd1, %SP, 0;
; CHECK: st.param.b64 [param0], %rd1;
-; CHECK: call.uni
-; CHECK: escape_foo,
-; CHECK: (
-; CHECK: param0
-; CHECK: );
+; CHECK: call.uni escape_foo, (param0);
; CHECK: } // callseq 0
; CHECK: .loc 1 6 1 // t.c:6:1
; CHECK: ret;
diff --git a/llvm/test/DebugInfo/X86/branch-folder-dbg-after-end.mir b/llvm/test/DebugInfo/X86/branch-folder-dbg-after-end.mir
new file mode 100644
index 0000000..743851c
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/branch-folder-dbg-after-end.mir
@@ -0,0 +1,108 @@
+# RUN: llc %s --start-before=branch-folder --stop-after=branch-folder -o - \
+# RUN: | FileCheck %s --implicit-check-not=DBG_PHI
+
+## Common instructions are hoisted. Check that trailing debug instructions in
+## the range are also hoisted, and don't cause a crash.
+##
+## Note the MIR doesn't match the IR; it was modified by hand from:
+## llvm/test/DebugInfo/X86/branch-folder-dbg.mir
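+##
+## A rough sketch of what is being exercised: the identical MOV32r0 at the
+## head of if.then and if.else is hoisted into their common predecessor, and
+## the trailing DBG_VALUEs in the hoisted range come along with their
+## locations reset to $noreg, as the CHECK lines below spell out.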
+
+# CHECK: bb.0
+# CHECK: CALL64pcrel32 @f, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+## --- Start splice from bb.2.if.else (and debug instructions from bb.1.if.then) ---
+# CHECK-NEXT: $edi = MOV32r0 implicit-def dead $eflags, debug-location !DILocation(line: 0, scope: ![[#]])
+# CHECK-NEXT: DBG_VALUE $noreg, $noreg, ![[#]], !DIExpression(), debug-location
+# CHECK-NEXT: DBG_VALUE $noreg, $noreg, ![[#]], !DIExpression(), debug-location
+## --- End splice ------------------------------------------------------------------
+# CHECK-NEXT: TEST64rr killed renamable $rax, renamable $rax, implicit-def $eflags
+# CHECK-NEXT: JCC_1 %bb.2, 8, implicit $eflags
+# CHECK: bb.1
+
+--- |
+ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+ target triple = "x86_64-unknown-linux-gnu"
+
+ declare dso_local noundef i64 @f() local_unnamed_addr
+
+ define dso_local noundef i32 @g() local_unnamed_addr !dbg !7 {
+ %call = tail call noundef i64 @f()
+ %cmp1 = icmp sgt i64 0, %call
+ %conv2 = trunc i64 0 to i32
+ br i1 %cmp1, label %if.then, label %if.else
+
+ if.then: ; preds = %0
+ tail call void @_Z3fooii(i32 noundef %conv2, i32 noundef 0), !dbg !14
+ br label %if.end, !dbg !15
+
+ if.else: ; preds = %0
+ tail call void @_Z3barii(i32 noundef %conv2, i32 noundef 1), !dbg !16
+ br label %if.end, !dbg !17
+
+ if.end: ; preds = %if.else, %if.then
+ ret i32 2
+ }
+
+ declare void @_Z3fooii(i32 noundef, i32 noundef) local_unnamed_addr
+
+ declare void @_Z3barii(i32 noundef, i32 noundef) local_unnamed_addr
+
+ !llvm.module.flags = !{!0, !1}
+ !llvm.ident = !{!2}
+ !llvm.dbg.cu = !{!3}
+ !llvm.debugify = !{!5, !6}
+
+ !0 = !{i32 7, !"Dwarf Version", i32 5}
+ !1 = !{i32 2, !"Debug Info Version", i32 3}
+ !2 = !{!"clang version 21.0.0"}
+ !3 = distinct !DICompileUnit(language: DW_LANG_C, file: !4, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+ !4 = !DIFile(filename: "test.nodbg.ll", directory: "/")
+ !5 = !{i32 15}
+ !6 = !{i32 7}
+ !7 = distinct !DISubprogram(name: "g", linkageName: "g", scope: null, file: !4, line: 1, type: !8, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !3, retainedNodes: !10)
+ !8 = !DISubroutineType(types: !9)
+ !9 = !{}
+ !10 = !{!11}
+ !11 = !DILocalVariable(name: "1", scope: !7, file: !4, line: 3, type: !12)
+ !12 = !DIBasicType(name: "ty64", size: 64, encoding: DW_ATE_unsigned)
+ !13 = !DILocation(line: 3, column: 1, scope: !7)
+ !14 = !DILocation(line: 9, column: 1, scope: !7)
+ !15 = !DILocation(line: 10, column: 1, scope: !7)
+ !16 = !DILocation(line: 11, column: 1, scope: !7)
+ !17 = !DILocation(line: 12, column: 1, scope: !7)
+...
+---
+name: g
+tracksRegLiveness: true
+isSSA: false
+body: |
+ bb.0 (%ir-block.0):
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+
+ frame-setup PUSH64r undef $rax, implicit-def $rsp, implicit $rsp
+ frame-setup CFI_INSTRUCTION def_cfa_offset 16
+ CALL64pcrel32 @f, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+ TEST64rr killed renamable $rax, renamable $rax, implicit-def $eflags
+ JCC_1 %bb.2, 9, implicit killed $eflags
+ JMP_1 %bb.1
+
+ bb.1.if.then:
+ successors: %bb.3(0x80000000)
+
+ $edi = MOV32r0 implicit-def dead $eflags, debug-location !14
+ DBG_VALUE $edi, $noreg, !11, !DIExpression(), debug-location !13
+
+ bb.3.if.end:
+ $eax = MOV32ri 2
+ $rcx = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+ frame-destroy CFI_INSTRUCTION def_cfa_offset 8
+ RET 0, $eax
+
+ bb.2.if.else:
+ successors: %bb.3(0x80000000)
+
+ $edi = MOV32r0 implicit-def dead $eflags, debug-location !16
+ DBG_VALUE $edi, $noreg, !11, !DIExpression(), debug-location !13
+ CALL64pcrel32 target-flags(x86-plt) @_Z3barii, csr_64, implicit $rsp, implicit $ssp, implicit killed $edi, implicit killed $edi, implicit-def $rsp, implicit-def $ssp, debug-location !16
+ JMP_1 %bb.3, debug-location !15
+
+...
diff --git a/llvm/test/DebugInfo/X86/branch-folder-dbg.mir b/llvm/test/DebugInfo/X86/branch-folder-dbg.mir
index 5c38fd2..11b37218 100644
--- a/llvm/test/DebugInfo/X86/branch-folder-dbg.mir
+++ b/llvm/test/DebugInfo/X86/branch-folder-dbg.mir
@@ -1,16 +1,28 @@
-# RUN: llc %s --start-before=branch-folder --stop-after=branch-folder -o - | FileCheck %s
+# RUN: llc %s --start-before=branch-folder --stop-after=branch-folder -o - \
+# RUN: | FileCheck %s --implicit-check-not=DBG_PHI
## Check that common instructions hoisted from `if.then` and `if.else` into
-## common pred `entry` get merged debug locations.
-
-## FIXME: The debug instructions handling here is wrong.
+## common pred `entry` get merged debug locations. The debug instructions from
+## both branches should get hoisted and killed.
+##
+## The MIR debug instructions have been modified by hand in order to check they
+## can be killed.
+##
+## Check DBG_PHIs are deleted rather than hoisted (implicit-check-not).
+##
+## Check DBG_LABELs are hoisted and not modified (and don't cause a crash).
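+##
+## In short, per the CHECK lines below: hoisted DBG_VALUEs are reset to
+## $noreg, each DBG_INSTR_REF survives as a $noreg DBG_VALUE carrying its
+## DW_OP_LLVM_arg expression, and the DBG_PHIs disappear entirely.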
# CHECK: bb.0
# CHECK: CALL64pcrel32 @f, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
-## --- Start splice from bb.2.if.else ---
-# CHECK-NEXT: DBG_VALUE 2, $noreg, ![[#]], !DIExpression(), debug-location ![[#]]
-# CHECK-NEXT: $edi = MOV32r0 implicit-def dead $eflags, debug-location !DILocation(line: 0, scope: ![[#]])
-## --- End splice --------------
+## --- Start splice from bb.2.if.else (and debug instructions from bb.1.if.then) ---
+# CHECK-NEXT: DBG_LABEL 0
+# CHECK-NEXT: DBG_VALUE $noreg, $noreg, ![[#]], !DIExpression(), debug-location ![[#]]
+# CHECK-NEXT: DBG_LABEL 1
+# CHECK-NEXT: DBG_VALUE $noreg, $noreg, ![[#]], !DIExpression(), debug-location ![[#]]
+# CHECK-NEXT: $edi = MOV32r0 implicit-def dead $eflags, debug-instr-number 2, debug-location !DILocation(line: 0, scope: ![[#]])
+# CHECK-NEXT: DBG_VALUE $noreg, $noreg, ![[#]], !DIExpression(DW_OP_LLVM_arg, 0), debug-location ![[#]]
+# CHECK-NEXT: DBG_VALUE $noreg, $noreg, ![[#]], !DIExpression(DW_OP_LLVM_arg, 0), debug-location ![[#]]
+## --- End splice ------------------------------------------------------------------
# CHECK-NEXT: TEST64rr killed renamable $rax, renamable $rax, implicit-def $eflags
# CHECK-NEXT: JCC_1 %bb.2, 9, implicit killed $eflags
# CHECK: bb.1
@@ -73,6 +85,8 @@
...
---
name: g
+tracksRegLiveness: true
+isSSA: false
body: |
bb.0 (%ir-block.0):
successors: %bb.1(0x40000000), %bb.2(0x40000000)
@@ -87,21 +101,25 @@ body: |
bb.1.if.then:
successors: %bb.3(0x80000000)
- DBG_VALUE 0, $noreg, !11, !DIExpression(), debug-location !13
- $edi = MOV32r0 implicit-def dead $eflags, debug-location !14
+ DBG_PHI $esp, 3
+ DBG_LABEL 0
+ DBG_VALUE $esi, $noreg, !11, !DIExpression(), debug-location !13
+ $edi = MOV32r0 implicit-def dead $eflags, debug-instr-number 1, debug-location !14
+ DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !13
$esi = MOV32r0 implicit-def dead $eflags, debug-location !14
CALL64pcrel32 target-flags(x86-plt) @_Z3fooii, csr_64, implicit $rsp, implicit $ssp, implicit killed $edi, implicit killed $esi, implicit-def $rsp, implicit-def $ssp, debug-location !14
- DBG_VALUE 1, $noreg, !11, !DIExpression(), debug-location !13
JMP_1 %bb.3, debug-location !15
bb.2.if.else:
successors: %bb.3(0x80000000)
- DBG_VALUE 2, $noreg, !11, !DIExpression(), debug-location !13
- $edi = MOV32r0 implicit-def dead $eflags, debug-location !16
+ DBG_PHI $esp, 4
+ DBG_LABEL 1
+ DBG_VALUE $esp, $noreg, !11, !DIExpression(), debug-location !13
+ $edi = MOV32r0 implicit-def dead $eflags, debug-instr-number 2, debug-location !16
+ DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !13
$esi = MOV32ri 1, debug-location !16
CALL64pcrel32 target-flags(x86-plt) @_Z3barii, csr_64, implicit $rsp, implicit $ssp, implicit killed $edi, implicit killed $esi, implicit-def $rsp, implicit-def $ssp, debug-location !16
- DBG_VALUE 3, $noreg, !11, !DIExpression(), debug-location !13
bb.3.if.end:
$eax = MOV32ri 2
diff --git a/llvm/test/DebugInfo/X86/stringpool.ll b/llvm/test/DebugInfo/X86/stringpool.ll
index 219e672..1c3a57a 100644
--- a/llvm/test/DebugInfo/X86/stringpool.ll
+++ b/llvm/test/DebugInfo/X86/stringpool.ll
@@ -28,8 +28,12 @@ source_filename = "test/DebugInfo/X86/stringpool.ll"
; LINUX: .section .debug_str,"MS",@progbits,1
; LINUX: yyyy
+; DARWIN: .section __DWARF,__debug_abbrev,regular,debug
+; DARWIN-NEXT: Lsection_abbrev:
+
; Verify that we refer to 'yyyy' with a direct offset.
; DARWIN: .section __DWARF,__debug_info,regular,debug
+; DARWIN: Lset1 = Lsection_abbrev-Lsection_abbrev
; DARWIN: DW_TAG_variable
; DARWIN: .long [[YYYY:[0-9]+]]
; DARWIN-NEXT: .long {{[0-9]+}} ## DW_AT_type
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/exception-lifetime.ll b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/exception-lifetime.ll
index dda9a70..ac5d8b8 100644
--- a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/exception-lifetime.ll
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/exception-lifetime.ll
@@ -57,7 +57,7 @@ define void @test() sanitize_hwaddress personality ptr @__gxx_personality_v0 {
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[X]], i32 15
; CHECK-NEXT: store i8 [[TMP20]], ptr [[TMP26]], align 1
; CHECK-NEXT: invoke void @mayFail(ptr [[X_HWASAN]])
-; CHECK-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]]
+; CHECK-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]]
; CHECK: invoke.cont:
; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8
; CHECK-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[X]] to i64
@@ -69,12 +69,10 @@ define void @test() sanitize_hwaddress personality ptr @__gxx_personality_v0 {
; CHECK-NEXT: ret void
; CHECK: lpad:
; CHECK-NEXT: [[TMP32:%.*]] = landingpad { ptr, i32 }
-; CHECK-NEXT: cleanup
+; CHECK-NEXT: cleanup
; CHECK-NEXT: [[TMP33:%.*]] = extractvalue { ptr, i32 } [[TMP32]], 0
-; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP14]], ptr [[EXN_SLOT]], i32 19)
; CHECK-NEXT: store ptr [[TMP33]], ptr [[EXN_SLOT]], align 8
; CHECK-NEXT: [[TMP34:%.*]] = extractvalue { ptr, i32 } [[TMP32]], 1
-; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP14]], ptr [[EHSELECTOR_SLOT]], i32 18)
; CHECK-NEXT: store i32 [[TMP34]], ptr [[EHSELECTOR_SLOT]], align 4
; CHECK-NEXT: call void @onExcept(ptr [[X_HWASAN]])
; CHECK-NEXT: [[TMP35:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8
@@ -86,9 +84,7 @@ define void @test() sanitize_hwaddress personality ptr @__gxx_personality_v0 {
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[X]])
; CHECK-NEXT: br label [[EH_RESUME:%.*]]
; CHECK: eh.resume:
-; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP14]], ptr [[EXN_SLOT]], i32 3)
; CHECK-NEXT: [[EXN:%.*]] = load ptr, ptr [[EXN_SLOT]], align 8
-; CHECK-NEXT: call void @llvm.hwasan.check.memaccess.shortgranules(ptr [[TMP14]], ptr [[EHSELECTOR_SLOT]], i32 2)
; CHECK-NEXT: [[SEL:%.*]] = load i32, ptr [[EHSELECTOR_SLOT]], align 4
; CHECK-NEXT: [[LPAD_VAL:%.*]] = insertvalue { ptr, i32 } undef, ptr [[EXN]], 0
; CHECK-NEXT: [[LPAD_VAL1:%.*]] = insertvalue { ptr, i32 } [[LPAD_VAL]], i32 [[SEL]], 1
@@ -98,12 +94,12 @@ entry:
%x = alloca i32, align 8
%exn.slot = alloca ptr, align 8
%ehselector.slot = alloca i32, align 4
- call void @llvm.lifetime.start.p0(i64 8, ptr %x)
+ call void @llvm.lifetime.start.p0(i64 4, ptr %x)
invoke void @mayFail(ptr %x) to label %invoke.cont unwind label %lpad
invoke.cont: ; preds = %entry
- call void @llvm.lifetime.end.p0(i64 8, ptr %x)
+ call void @llvm.lifetime.end.p0(i64 4, ptr %x)
ret void
lpad: ; preds = %entry
@@ -115,7 +111,7 @@ lpad: ; preds = %entry
%2 = extractvalue { ptr, i32 } %0, 1
store i32 %2, ptr %ehselector.slot, align 4
call void @onExcept(ptr %x) #18
- call void @llvm.lifetime.end.p0(i64 8, ptr %x)
+ call void @llvm.lifetime.end.p0(i64 4, ptr %x)
br label %eh.resume
eh.resume: ; preds = %lpad
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/exception-lifetime.ll b/llvm/test/Instrumentation/HWAddressSanitizer/exception-lifetime.ll
index 50ce490..3e13eb4 100644
--- a/llvm/test/Instrumentation/HWAddressSanitizer/exception-lifetime.ll
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/exception-lifetime.ll
@@ -22,7 +22,7 @@ define void @test() sanitize_hwaddress personality ptr @__gxx_personality_v0 {
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 48
; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = ashr i64 [[TMP2]], 3
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1:![0-9]+]])
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.read_register.i64(metadata [[META2:![0-9]+]])
; CHECK-NEXT: [[TMP5:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 44
@@ -56,7 +56,7 @@ define void @test() sanitize_hwaddress personality ptr @__gxx_personality_v0 {
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP16]], i64 [[TMP25]]
; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP26]], i8 [[TMP22]], i64 1, i1 false)
; CHECK-NEXT: invoke void @mayFail(ptr [[X_HWASAN]])
-; CHECK-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]]
+; CHECK-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]]
; CHECK: invoke.cont:
; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8
; CHECK-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[X]] to i64
@@ -68,14 +68,12 @@ define void @test() sanitize_hwaddress personality ptr @__gxx_personality_v0 {
; CHECK-NEXT: ret void
; CHECK: lpad:
; CHECK-NEXT: [[TMP32:%.*]] = landingpad { ptr, i32 }
-; CHECK-NEXT: cleanup
-; CHECK-NEXT: [[TMP33:%.*]] = call i64 @llvm.read_register.i64(metadata [[META2:![0-9]+]])
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: [[TMP33:%.*]] = call i64 @llvm.read_register.i64(metadata [[META3:![0-9]+]])
; CHECK-NEXT: call void @__hwasan_handle_vfork(i64 [[TMP33]])
; CHECK-NEXT: [[TMP34:%.*]] = extractvalue { ptr, i32 } [[TMP32]], 0
-; CHECK-NEXT: call void @llvm.hwasan.check.memaccess(ptr [[TMP16]], ptr [[EXN_SLOT]], i32 19)
; CHECK-NEXT: store ptr [[TMP34]], ptr [[EXN_SLOT]], align 8
; CHECK-NEXT: [[TMP35:%.*]] = extractvalue { ptr, i32 } [[TMP32]], 1
-; CHECK-NEXT: call void @llvm.hwasan.check.memaccess(ptr [[TMP16]], ptr [[EHSELECTOR_SLOT]], i32 18)
; CHECK-NEXT: store i32 [[TMP35]], ptr [[EHSELECTOR_SLOT]], align 4
; CHECK-NEXT: call void @onExcept(ptr [[X_HWASAN]])
; CHECK-NEXT: [[TMP36:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8
@@ -87,9 +85,7 @@ define void @test() sanitize_hwaddress personality ptr @__gxx_personality_v0 {
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[X]])
; CHECK-NEXT: br label [[EH_RESUME:%.*]]
; CHECK: eh.resume:
-; CHECK-NEXT: call void @llvm.hwasan.check.memaccess(ptr [[TMP16]], ptr [[EXN_SLOT]], i32 3)
; CHECK-NEXT: [[EXN:%.*]] = load ptr, ptr [[EXN_SLOT]], align 8
-; CHECK-NEXT: call void @llvm.hwasan.check.memaccess(ptr [[TMP16]], ptr [[EHSELECTOR_SLOT]], i32 2)
; CHECK-NEXT: [[SEL:%.*]] = load i32, ptr [[EHSELECTOR_SLOT]], align 4
; CHECK-NEXT: [[LPAD_VAL:%.*]] = insertvalue { ptr, i32 } undef, ptr [[EXN]], 0
; CHECK-NEXT: [[LPAD_VAL1:%.*]] = insertvalue { ptr, i32 } [[LPAD_VAL]], i32 [[SEL]], 1
@@ -99,12 +95,12 @@ entry:
%x = alloca i32, align 8
%exn.slot = alloca ptr, align 8
%ehselector.slot = alloca i32, align 4
- call void @llvm.lifetime.start.p0(i64 8, ptr %x)
+ call void @llvm.lifetime.start.p0(i64 4, ptr %x)
invoke void @mayFail(ptr %x) to label %invoke.cont unwind label %lpad
invoke.cont: ; preds = %entry
- call void @llvm.lifetime.end.p0(i64 8, ptr %x)
+ call void @llvm.lifetime.end.p0(i64 4, ptr %x)
ret void
lpad: ; preds = %entry
@@ -116,7 +112,7 @@ lpad: ; preds = %entry
%2 = extractvalue { ptr, i32 } %0, 1
store i32 %2, ptr %ehselector.slot, align 4
call void @onExcept(ptr %x) #18
- call void @llvm.lifetime.end.p0(i64 8, ptr %x)
+ call void @llvm.lifetime.end.p0(i64 4, ptr %x)
br label %eh.resume
eh.resume: ; preds = %lpad
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s
index b9eb2d2..c5288a7 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s
@@ -249,6 +249,250 @@ flat_load_monitor_b32 v1, v[2:3] offset:64
// GFX1250: flat_load_monitor_b32 v1, v[2:3] offset:64 ; encoding: [0x7c,0x00,0x1c,0xec,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+flat_load_monitor_b32 v1, v[2:3] offset:-64 th:TH_LOAD_BYPASS scope:SCOPE_SYS
+// GFX1250: flat_load_monitor_b32 v1, v[2:3] offset:-64 th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x00,0x1c,0xec,0x01,0x00,0x3c,0x00,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+flat_load_monitor_b64 v[0:1], v[2:3]
+// GFX1250: flat_load_monitor_b64 v[0:1], v[2:3] ; encoding: [0x7c,0x40,0x1c,0xec,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+flat_load_monitor_b64 v[0:1], v[2:3] offset:64
+// GFX1250: flat_load_monitor_b64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0x40,0x1c,0xec,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+flat_load_monitor_b64 v[0:1], v[2:3] offset:-64 th:TH_LOAD_BYPASS scope:SCOPE_SYS
+// GFX1250: flat_load_monitor_b64 v[0:1], v[2:3] offset:-64 th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x40,0x1c,0xec,0x00,0x00,0x3c,0x00,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+flat_load_monitor_b128 v[0:3], v[4:5]
+// GFX1250: flat_load_monitor_b128 v[0:3], v[4:5] ; encoding: [0x7c,0x80,0x1c,0xec,0x00,0x00,0x00,0x00,0x04,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+flat_load_monitor_b128 v[0:3], v[4:5] offset:64
+// GFX1250: flat_load_monitor_b128 v[0:3], v[4:5] offset:64 ; encoding: [0x7c,0x80,0x1c,0xec,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+flat_load_monitor_b128 v[0:3], v[4:5] offset:-64 th:TH_LOAD_BYPASS scope:SCOPE_SYS
+// GFX1250: flat_load_monitor_b128 v[0:3], v[4:5] offset:-64 th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x80,0x1c,0xec,0x00,0x00,0x3c,0x00,0x04,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+flat_load_monitor_b32 v1, v2, s[4:5] offset:64 scale_offset
+// GFX1250: flat_load_monitor_b32 v1, v2, s[4:5] offset:64 scale_offset ; encoding: [0x04,0x00,0x1c,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+flat_load_monitor_b64 v[2:3], v2, s[4:5] offset:64 scale_offset
+// GFX1250: flat_load_monitor_b64 v[2:3], v2, s[4:5] offset:64 scale_offset ; encoding: [0x04,0x40,0x1c,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b8 v[2:3], v1, off th:TH_STORE_BYPASS scope:SCOPE_SYS
+// GFX1250: global_store_async_from_lds_b8 v[2:3], v1, off th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x18,0xee,0x00,0x00,0xbc,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b8 v[2:3], v1, off offset:64
+// GFX1250: global_store_async_from_lds_b8 v[2:3], v1, off offset:64 ; encoding: [0x7c,0xc0,0x18,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b8 v[2:3], v1, off offset:-64
+// GFX1250: global_store_async_from_lds_b8 v[2:3], v1, off offset:-64 ; encoding: [0x7c,0xc0,0x18,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b8 v2, v1, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV
+// GFX1250: global_store_async_from_lds_b8 v2, v1, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0xc0,0x18,0xee,0x00,0x00,0xe8,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b8 v2, v1, s[2:3] offset:64
+// GFX1250: global_store_async_from_lds_b8 v2, v1, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x18,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b8 v2, v1, s[2:3] offset:-64
+// GFX1250: global_store_async_from_lds_b8 v2, v1, s[2:3] offset:-64 ; encoding: [0x02,0xc0,0x18,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b32 v[2:3], v1, off th:TH_STORE_BYPASS scope:SCOPE_SYS
+// GFX1250: global_store_async_from_lds_b32 v[2:3], v1, off th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x00,0x19,0xee,0x00,0x00,0xbc,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b32 v[2:3], v1, off offset:64
+// GFX1250: global_store_async_from_lds_b32 v[2:3], v1, off offset:64 ; encoding: [0x7c,0x00,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b32 v[2:3], v1, off offset:-64
+// GFX1250: global_store_async_from_lds_b32 v[2:3], v1, off offset:-64 ; encoding: [0x7c,0x00,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b32 v2, v1, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV
+// GFX1250: global_store_async_from_lds_b32 v2, v1, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x00,0x19,0xee,0x00,0x00,0xe8,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b32 v2, v1, s[2:3] offset:64
+// GFX1250: global_store_async_from_lds_b32 v2, v1, s[2:3] offset:64 ; encoding: [0x02,0x00,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b32 v2, v1, s[2:3] offset:-64
+// GFX1250: global_store_async_from_lds_b32 v2, v1, s[2:3] offset:-64 ; encoding: [0x02,0x00,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b64 v[2:3], v1, off th:TH_STORE_BYPASS scope:SCOPE_SYS
+// GFX1250: global_store_async_from_lds_b64 v[2:3], v1, off th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x40,0x19,0xee,0x00,0x00,0xbc,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b64 v[2:3], v1, off offset:64
+// GFX1250: global_store_async_from_lds_b64 v[2:3], v1, off offset:64 ; encoding: [0x7c,0x40,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b64 v[2:3], v1, off offset:-64
+// GFX1250: global_store_async_from_lds_b64 v[2:3], v1, off offset:-64 ; encoding: [0x7c,0x40,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b64 v2, v1, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV
+// GFX1250: global_store_async_from_lds_b64 v2, v1, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x40,0x19,0xee,0x00,0x00,0xe8,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b64 v2, v1, s[2:3] offset:64
+// GFX1250: global_store_async_from_lds_b64 v2, v1, s[2:3] offset:64 ; encoding: [0x02,0x40,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b64 v2, v1, s[2:3] offset:-64
+// GFX1250: global_store_async_from_lds_b64 v2, v1, s[2:3] offset:-64 ; encoding: [0x02,0x40,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b128 v[2:3], v1, off th:TH_STORE_BYPASS scope:SCOPE_SYS
+// GFX1250: global_store_async_from_lds_b128 v[2:3], v1, off th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x80,0x19,0xee,0x00,0x00,0xbc,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b128 v[2:3], v1, off offset:64
+// GFX1250: global_store_async_from_lds_b128 v[2:3], v1, off offset:64 ; encoding: [0x7c,0x80,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b128 v[2:3], v1, off offset:-64
+// GFX1250: global_store_async_from_lds_b128 v[2:3], v1, off offset:-64 ; encoding: [0x7c,0x80,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b128 v2, v1, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV
+// GFX1250: global_store_async_from_lds_b128 v2, v1, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x80,0x19,0xee,0x00,0x00,0xe8,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b128 v2, v1, s[2:3] offset:64
+// GFX1250: global_store_async_from_lds_b128 v2, v1, s[2:3] offset:64 ; encoding: [0x02,0x80,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b128 v2, v1, s[2:3] offset:-64
+// GFX1250: global_store_async_from_lds_b128 v2, v1, s[2:3] offset:-64 ; encoding: [0x02,0x80,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b32 v2, v1, s[4:5] scale_offset th:TH_STORE_BYPASS scope:SCOPE_SYS
+// GFX1250: global_store_async_from_lds_b32 v2, v1, s[4:5] scale_offset th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0x00,0x19,0xee,0x00,0x00,0xbd,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_store_async_from_lds_b64 v2, v1, s[4:5] scale_offset th:TH_STORE_BYPASS scope:SCOPE_SYS
+// GFX1250: global_store_async_from_lds_b64 v2, v1, s[4:5] scale_offset th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0x40,0x19,0xee,0x00,0x00,0xbd,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b8 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS
+// GFX1250: global_load_async_to_lds_b8 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x17,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b8 v1, v[2:3], off offset:64
+// GFX1250: global_load_async_to_lds_b8 v1, v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x17,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b8 v1, v[2:3], off offset:-64
+// GFX1250: global_load_async_to_lds_b8 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0xc0,0x17,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b8 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV
+// GFX1250: global_load_async_to_lds_b8 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0xc0,0x17,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b8 v1, v2, s[2:3] offset:64
+// GFX1250: global_load_async_to_lds_b8 v1, v2, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x17,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b8 v1, v2, s[2:3] offset:-64
+// GFX1250: global_load_async_to_lds_b8 v1, v2, s[2:3] offset:-64 ; encoding: [0x02,0xc0,0x17,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b32 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS
+// GFX1250: global_load_async_to_lds_b32 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x00,0x18,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b32 v1, v[2:3], off offset:64
+// GFX1250: global_load_async_to_lds_b32 v1, v[2:3], off offset:64 ; encoding: [0x7c,0x00,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b32 v1, v[2:3], off offset:-64
+// GFX1250: global_load_async_to_lds_b32 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0x00,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b32 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV
+// GFX1250: global_load_async_to_lds_b32 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x00,0x18,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b32 v1, v2, s[2:3] offset:64
+// GFX1250: global_load_async_to_lds_b32 v1, v2, s[2:3] offset:64 ; encoding: [0x02,0x00,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b32 v1, v2, s[2:3] offset:-64
+// GFX1250: global_load_async_to_lds_b32 v1, v2, s[2:3] offset:-64 ; encoding: [0x02,0x00,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b64 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS
+// GFX1250: global_load_async_to_lds_b64 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x40,0x18,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b64 v1, v[2:3], off offset:64
+// GFX1250: global_load_async_to_lds_b64 v1, v[2:3], off offset:64 ; encoding: [0x7c,0x40,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b64 v1, v[2:3], off offset:-64
+// GFX1250: global_load_async_to_lds_b64 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0x40,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b64 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV
+// GFX1250: global_load_async_to_lds_b64 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x40,0x18,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b64 v1, v2, s[2:3] offset:64
+// GFX1250: global_load_async_to_lds_b64 v1, v2, s[2:3] offset:64 ; encoding: [0x02,0x40,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b64 v1, v2, s[2:3] offset:-64
+// GFX1250: global_load_async_to_lds_b64 v1, v2, s[2:3] offset:-64 ; encoding: [0x02,0x40,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b128 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS
+// GFX1250: global_load_async_to_lds_b128 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x80,0x18,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b128 v1, v[2:3], off offset:64
+// GFX1250: global_load_async_to_lds_b128 v1, v[2:3], off offset:64 ; encoding: [0x7c,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b128 v1, v[2:3], off offset:-64
+// GFX1250: global_load_async_to_lds_b128 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b128 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV
+// GFX1250: global_load_async_to_lds_b128 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x80,0x18,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b128 v1, v2, s[2:3] offset:64
+// GFX1250: global_load_async_to_lds_b128 v1, v2, s[2:3] offset:64 ; encoding: [0x02,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b128 v1, v2, s[2:3] offset:-64
+// GFX1250: global_load_async_to_lds_b128 v1, v2, s[2:3] offset:-64 ; encoding: [0x02,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b32 v2, v1, s[4:5] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS
+// GFX1250: global_load_async_to_lds_b32 v2, v1, s[4:5] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0x00,0x18,0xee,0x02,0x00,0x3d,0x00,0x01,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_async_to_lds_b64 v2, v1, s[4:5] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS
+// GFX1250: global_load_async_to_lds_b64 v2, v1, s[4:5] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0x40,0x18,0xee,0x02,0x00,0x3d,0x00,0x01,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
tensor_save s[0:1]
// GFX1250: tensor_save s[0:1] ; encoding: [0x00,0x80,0x1b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_err.s
index 26d7ed3..c9fe702 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_err.s
@@ -57,3 +57,51 @@ scratch_load_b32 v5, off, off offset:32 scale_offset
// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction
// GFX1250-ERR-NEXT:{{^}}scratch_load_b32 v5, off, off offset:32 scale_offset
// GFX1250-ERR-NEXT:{{^}} ^
+
+global_store_async_from_lds_b8 v[2:3], v1, off th:TH_LOAD_BYPASS scope:SCOPE_SYS
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for store instructions
+
+global_store_async_from_lds_b8 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for store instructions
+
+global_store_async_from_lds_b32 v[2:3], v1, off th:TH_LOAD_BYPASS scope:SCOPE_SYS
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for store instructions
+
+global_store_async_from_lds_b32 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for store instructions
+
+global_store_async_from_lds_b64 v[2:3], v1, off th:TH_LOAD_BYPASS scope:SCOPE_SYS
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for store instructions
+
+global_store_async_from_lds_b64 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for store instructions
+
+global_store_async_from_lds_b128 v[2:3], v1, off th:TH_LOAD_BYPASS scope:SCOPE_SYS
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for store instructions
+
+global_store_async_from_lds_b128 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for store instructions
+
+global_load_async_to_lds_b8 v1, v[2:3], off th:TH_STORE_BYPASS scope:SCOPE_SYS
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions
+
+global_load_async_to_lds_b8 v1, v2, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions
+
+global_load_async_to_lds_b32 v1, v[2:3], off th:TH_STORE_BYPASS scope:SCOPE_SYS
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions
+
+global_load_async_to_lds_b32 v1, v2, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions
+
+global_load_async_to_lds_b64 v1, v[2:3], off th:TH_STORE_BYPASS scope:SCOPE_SYS
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions
+
+global_load_async_to_lds_b64 v1, v2, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions
+
+global_load_async_to_lds_b128 v1, v[2:3], off th:TH_STORE_BYPASS scope:SCOPE_SYS
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions
+
+global_load_async_to_lds_b128 v1, v2, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV
+// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt
index de7895f..291192b 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt
@@ -3177,6 +3177,162 @@
# GFX1250: global_load_monitor_b64 v[2:3], v2, s[4:5] offset:64 scale_offset ; encoding: [0x04,0x40,0x1c,0xee,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
0x04,0x40,0x1c,0xee,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00
+# GFX1250: global_load_async_to_lds_b128 v1, v[2:3], off offset:64 ; encoding: [0x7c,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+0x7c,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: global_load_async_to_lds_b128 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
+0x7c,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff
+
+# GFX1250: global_load_async_to_lds_b128 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x80,0x18,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00]
+0x7c,0x80,0x18,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: global_load_async_to_lds_b128 v1, v2, s[2:3] offset:64 ; encoding: [0x02,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+0x02,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: global_load_async_to_lds_b128 v1, v2, s[2:3] offset:-64 ; encoding: [0x02,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
+0x02,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff
+
+# GFX1250: global_load_async_to_lds_b128 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x80,0x18,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00]
+0x02,0x80,0x18,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: global_load_async_to_lds_b32 v1, v[2:3], off offset:64 ; encoding: [0x7c,0x00,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+0x7c,0x00,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: global_load_async_to_lds_b32 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0x00,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
+0x7c,0x00,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff
+
+# GFX1250: global_load_async_to_lds_b32 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x00,0x18,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00]
+0x7c,0x00,0x18,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: global_load_async_to_lds_b32 v1, v2, s[2:3] offset:64 ; encoding: [0x02,0x00,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+0x02,0x00,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: global_load_async_to_lds_b32 v1, v2, s[2:3] offset:-64 ; encoding: [0x02,0x00,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
+0x02,0x00,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff
+
+# GFX1250: global_load_async_to_lds_b32 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x00,0x18,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00]
+0x02,0x00,0x18,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: global_load_async_to_lds_b64 v1, v[2:3], off offset:64 ; encoding: [0x7c,0x40,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+0x7c,0x40,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: global_load_async_to_lds_b64 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0x40,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
+0x7c,0x40,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff
+
+# GFX1250: global_load_async_to_lds_b64 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x40,0x18,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00]
+0x7c,0x40,0x18,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: global_load_async_to_lds_b64 v1, v2, s[2:3] offset:64 ; encoding: [0x02,0x40,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+0x02,0x40,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: global_load_async_to_lds_b64 v1, v2, s[2:3] offset:-64 ; encoding: [0x02,0x40,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
+0x02,0x40,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff
+
+# GFX1250: global_load_async_to_lds_b64 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x40,0x18,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00]
+0x02,0x40,0x18,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: global_load_async_to_lds_b8 v1, v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x17,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+0x7c,0xc0,0x17,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: global_load_async_to_lds_b8 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0xc0,0x17,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
+0x7c,0xc0,0x17,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff
+
+# GFX1250: global_load_async_to_lds_b8 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x17,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00]
+0x7c,0xc0,0x17,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: global_load_async_to_lds_b8 v1, v2, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x17,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
+0x02,0xc0,0x17,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: global_load_async_to_lds_b8 v1, v2, s[2:3] offset:-64 ; encoding: [0x02,0xc0,0x17,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
+0x02,0xc0,0x17,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff
+
+# GFX1250: global_load_async_to_lds_b8 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0xc0,0x17,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00]
+0x02,0xc0,0x17,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: global_load_async_to_lds_b32 v2, v1, s[4:5] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0x00,0x18,0xee,0x02,0x00,0x3d,0x00,0x01,0x00,0x00,0x00]
+0x04,0x00,0x18,0xee,0x02,0x00,0x3d,0x00,0x01,0x00,0x00,0x00
+
+# GFX1250: global_load_async_to_lds_b64 v2, v1, s[4:5] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0x40,0x18,0xee,0x02,0x00,0x3d,0x00,0x01,0x00,0x00,0x00]
+0x04,0x40,0x18,0xee,0x02,0x00,0x3d,0x00,0x01,0x00,0x00,0x00
+
+# GFX1250: global_store_async_from_lds_b128 v[2:3], v1, off offset:64 ; encoding: [0x7c,0x80,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00]
+0x7c,0x80,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: global_store_async_from_lds_b128 v[2:3], v1, off offset:-64 ; encoding: [0x7c,0x80,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff]
+0x7c,0x80,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff
+
+# GFX1250: global_store_async_from_lds_b128 v[2:3], v1, off th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x80,0x19,0xee,0x00,0x00,0xbc,0x00,0x02,0x00,0x00,0x00]
+0x7c,0x80,0x19,0xee,0x00,0x00,0xbc,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: global_store_async_from_lds_b128 v2, v1, s[2:3] offset:64 ; encoding: [0x02,0x80,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00]
+0x02,0x80,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: global_store_async_from_lds_b128 v2, v1, s[2:3] offset:-64 ; encoding: [0x02,0x80,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff]
+0x02,0x80,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff
+
+# GFX1250: global_store_async_from_lds_b128 v2, v1, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x80,0x19,0xee,0x00,0x00,0xe8,0x00,0x02,0x00,0x00,0x00]
+0x02,0x80,0x19,0xee,0x00,0x00,0xe8,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: global_store_async_from_lds_b32 v[2:3], v1, off offset:64 ; encoding: [0x7c,0x00,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00]
+0x7c,0x00,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: global_store_async_from_lds_b32 v[2:3], v1, off offset:-64 ; encoding: [0x7c,0x00,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff]
+0x7c,0x00,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff
+
+# GFX1250: global_store_async_from_lds_b32 v[2:3], v1, off th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x00,0x19,0xee,0x00,0x00,0xbc,0x00,0x02,0x00,0x00,0x00]
+0x7c,0x00,0x19,0xee,0x00,0x00,0xbc,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: global_store_async_from_lds_b32 v2, v1, s[2:3] offset:64 ; encoding: [0x02,0x00,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00]
+0x02,0x00,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: global_store_async_from_lds_b32 v2, v1, s[2:3] offset:-64 ; encoding: [0x02,0x00,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff]
+0x02,0x00,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff
+
+# GFX1250: global_store_async_from_lds_b32 v2, v1, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x00,0x19,0xee,0x00,0x00,0xe8,0x00,0x02,0x00,0x00,0x00]
+0x02,0x00,0x19,0xee,0x00,0x00,0xe8,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: global_store_async_from_lds_b64 v[2:3], v1, off offset:64 ; encoding: [0x7c,0x40,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00]
+0x7c,0x40,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: global_store_async_from_lds_b64 v[2:3], v1, off offset:-64 ; encoding: [0x7c,0x40,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff]
+0x7c,0x40,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff
+
+# GFX1250: global_store_async_from_lds_b64 v[2:3], v1, off th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x40,0x19,0xee,0x00,0x00,0xbc,0x00,0x02,0x00,0x00,0x00]
+0x7c,0x40,0x19,0xee,0x00,0x00,0xbc,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: global_store_async_from_lds_b64 v2, v1, s[2:3] offset:64 ; encoding: [0x02,0x40,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00]
+0x02,0x40,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: global_store_async_from_lds_b64 v2, v1, s[2:3] offset:-64 ; encoding: [0x02,0x40,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff]
+0x02,0x40,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff
+
+# GFX1250: global_store_async_from_lds_b64 v2, v1, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x40,0x19,0xee,0x00,0x00,0xe8,0x00,0x02,0x00,0x00,0x00]
+0x02,0x40,0x19,0xee,0x00,0x00,0xe8,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: global_store_async_from_lds_b8 v[2:3], v1, off offset:64 ; encoding: [0x7c,0xc0,0x18,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00]
+0x7c,0xc0,0x18,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: global_store_async_from_lds_b8 v[2:3], v1, off offset:-64 ; encoding: [0x7c,0xc0,0x18,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff]
+0x7c,0xc0,0x18,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff
+
+# GFX1250: global_store_async_from_lds_b8 v[2:3], v1, off th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x18,0xee,0x00,0x00,0xbc,0x00,0x02,0x00,0x00,0x00]
+0x7c,0xc0,0x18,0xee,0x00,0x00,0xbc,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: global_store_async_from_lds_b8 v2, v1, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x18,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00]
+0x02,0xc0,0x18,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00
+
+# GFX1250: global_store_async_from_lds_b8 v2, v1, s[2:3] offset:-64 ; encoding: [0x02,0xc0,0x18,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff]
+0x02,0xc0,0x18,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff
+
+# GFX1250: global_store_async_from_lds_b8 v2, v1, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0xc0,0x18,0xee,0x00,0x00,0xe8,0x00,0x02,0x00,0x00,0x00]
+0x02,0xc0,0x18,0xee,0x00,0x00,0xe8,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: global_store_async_from_lds_b32 v2, v1, s[4:5] scale_offset th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0x00,0x19,0xee,0x00,0x00,0xbd,0x00,0x02,0x00,0x00,0x00]
+0x04,0x00,0x19,0xee,0x00,0x00,0xbd,0x00,0x02,0x00,0x00,0x00
+
+# GFX1250: global_store_async_from_lds_b64 v2, v1, s[4:5] scale_offset th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0x40,0x19,0xee,0x00,0x00,0xbd,0x00,0x02,0x00,0x00,0x00]
+0x04,0x40,0x19,0xee,0x00,0x00,0xbd,0x00,0x02,0x00,0x00,0x00
+
# GFX1250: tensor_save s[0:1] ; encoding: [0x00,0x80,0x1b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
0x00,0x80,0x1b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-errors.test b/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-errors.test
index fdca11b..369005f 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-errors.test
+++ b/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-errors.test
@@ -13,10 +13,10 @@
# RES_4_2: ; error decoding test.kd: kernel descriptor reserved bits in range (511:480) set
# RES_4_2-NEXT: ; decoding failed region as bytes
-# RUN: yaml2obj %s -DGPU=GFX90A -DKD=00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000006000000000000 \
-# RUN: | llvm-objdump --disassemble-symbols=test.kd - | FileCheck %s --check-prefix=RES_457
-# RES_457: ; error decoding test.kd: kernel descriptor reserved bits in range (457:455) set
-# RES_457-NEXT: ; decoding failed region as bytes
+# RUN: yaml2obj %s -DGPU=GFX90A -DKD=00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003000000000000 \
+# RUN: | llvm-objdump --disassemble-symbols=test.kd - | FileCheck %s --check-prefix=RES_456
+# RES_456: ; error decoding test.kd: kernel descriptor reserved bits in range (456:455) set
+# RES_456-NEXT: ; decoding failed region as bytes
# RUN: yaml2obj %s -DGPU=GFX90A -DKD=0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000c000000000000 \
# RUN: | llvm-objdump --disassemble-symbols=test.kd - | FileCheck %s --check-prefix=WF32
diff --git a/llvm/test/MC/Disassembler/RISCV/c_slli.txt b/llvm/test/MC/Disassembler/RISCV/c_slli.txt
index 2520455..d8d65ef 100644
--- a/llvm/test/MC/Disassembler/RISCV/c_slli.txt
+++ b/llvm/test/MC/Disassembler/RISCV/c_slli.txt
@@ -14,7 +14,7 @@
# RUN: -M no-aliases --show-encoding < %s 2>&1 | \
# RUN: FileCheck --check-prefix=NOHINTS %s
-# GOOD: c.slli64 zero
+# GOOD: c.slli zero, 0
# NOHINTS: invalid instruction encoding
0x02 0x00
@@ -302,7 +302,7 @@
# NOHINTS: invalid instruction encoding
0x7E 0x10
-# GOOD: c.slli64 ra
+# GOOD: c.slli ra, 0
0x82 0x00
# GOOD: c.slli ra, 1
@@ -526,7 +526,7 @@
# GOOD64: c.slli ra, 63
0xFE 0x10
-# GOOD: c.slli64 sp
+# GOOD: c.slli sp, 0
0x02 0x01
# GOOD: c.slli sp, 1
@@ -750,7 +750,7 @@
# GOOD64: c.slli sp, 63
0x7E 0x11
-# GOOD: c.slli64 gp
+# GOOD: c.slli gp, 0
0x82 0x01
# GOOD: c.slli gp, 1
@@ -974,7 +974,7 @@
# GOOD64: c.slli gp, 63
0xFE 0x11
-# GOOD: c.slli64 tp
+# GOOD: c.slli tp, 0
0x02 0x02
# GOOD: c.slli tp, 1
@@ -1198,7 +1198,7 @@
# GOOD64: c.slli tp, 63
0x7E 0x12
-# GOOD: c.slli64 t0
+# GOOD: c.slli t0, 0
0x82 0x02
# GOOD: c.slli t0, 1
@@ -1422,7 +1422,7 @@
# GOOD64: c.slli t0, 63
0xFE 0x12
-# GOOD: c.slli64 t1
+# GOOD: c.slli t1, 0
0x02 0x03
# GOOD: c.slli t1, 1
@@ -1646,7 +1646,7 @@
# GOOD64: c.slli t1, 63
0x7E 0x13
-# GOOD: c.slli64 t2
+# GOOD: c.slli t2, 0
0x82 0x03
# GOOD: c.slli t2, 1
@@ -1870,7 +1870,7 @@
# GOOD64: c.slli t2, 63
0xFE 0x13
-# GOOD: c.slli64 s0
+# GOOD: c.slli s0, 0
0x02 0x04
# GOOD: c.slli s0, 1
@@ -2094,7 +2094,7 @@
# GOOD64: c.slli s0, 63
0x7E 0x14
-# GOOD: c.slli64 s1
+# GOOD: c.slli s1, 0
0x82 0x04
# GOOD: c.slli s1, 1
@@ -2318,7 +2318,7 @@
# GOOD64: c.slli s1, 63
0xFE 0x14
-# GOOD: c.slli64 a0
+# GOOD: c.slli a0, 0
0x02 0x05
# GOOD: c.slli a0, 1
@@ -2542,7 +2542,7 @@
# GOOD64: c.slli a0, 63
0x7E 0x15
-# GOOD: c.slli64 a1
+# GOOD: c.slli a1, 0
0x82 0x05
# GOOD: c.slli a1, 1
@@ -2766,7 +2766,7 @@
# GOOD64: c.slli a1, 63
0xFE 0x15
-# GOOD: c.slli64 a2
+# GOOD: c.slli a2, 0
0x02 0x06
# GOOD: c.slli a2, 1
@@ -2990,7 +2990,7 @@
# GOOD64: c.slli a2, 63
0x7E 0x16
-# GOOD: c.slli64 a3
+# GOOD: c.slli a3, 0
0x82 0x06
# GOOD: c.slli a3, 1
@@ -3214,7 +3214,7 @@
# GOOD64: c.slli a3, 63
0xFE 0x16
-# GOOD: c.slli64 a4
+# GOOD: c.slli a4, 0
0x02 0x07
# GOOD: c.slli a4, 1
@@ -3438,7 +3438,7 @@
# GOOD64: c.slli a4, 63
0x7E 0x17
-# GOOD: c.slli64 a5
+# GOOD: c.slli a5, 0
0x82 0x07
# GOOD: c.slli a5, 1
@@ -3662,7 +3662,7 @@
# GOOD64: c.slli a5, 63
0xFE 0x17
-# GOOD: c.slli64 a6
+# GOOD: c.slli a6, 0
0x02 0x08
# GOOD: c.slli a6, 1
@@ -3886,7 +3886,7 @@
# GOOD64: c.slli a6, 63
0x7E 0x18
-# GOOD: c.slli64 a7
+# GOOD: c.slli a7, 0
0x82 0x08
# GOOD: c.slli a7, 1
@@ -4110,7 +4110,7 @@
# GOOD64: c.slli a7, 63
0xFE 0x18
-# GOOD: c.slli64 s2
+# GOOD: c.slli s2, 0
0x02 0x09
# GOOD: c.slli s2, 1
@@ -4334,7 +4334,7 @@
# GOOD64: c.slli s2, 63
0x7E 0x19
-# GOOD: c.slli64 s3
+# GOOD: c.slli s3, 0
0x82 0x09
# GOOD: c.slli s3, 1
@@ -4558,7 +4558,7 @@
# GOOD64: c.slli s3, 63
0xFE 0x19
-# GOOD: c.slli64 s4
+# GOOD: c.slli s4, 0
0x02 0x0A
# GOOD: c.slli s4, 1
@@ -4782,7 +4782,7 @@
# GOOD64: c.slli s4, 63
0x7E 0x1A
-# GOOD: c.slli64 s5
+# GOOD: c.slli s5, 0
0x82 0x0A
# GOOD: c.slli s5, 1
@@ -5006,7 +5006,7 @@
# GOOD64: c.slli s5, 63
0xFE 0x1A
-# GOOD: c.slli64 s6
+# GOOD: c.slli s6, 0
0x02 0x0B
# GOOD: c.slli s6, 1
@@ -5230,7 +5230,7 @@
# GOOD64: c.slli s6, 63
0x7E 0x1B
-# GOOD: c.slli64 s7
+# GOOD: c.slli s7, 0
0x82 0x0B
# GOOD: c.slli s7, 1
@@ -5454,7 +5454,7 @@
# GOOD64: c.slli s7, 63
0xFE 0x1B
-# GOOD: c.slli64 s8
+# GOOD: c.slli s8, 0
0x02 0x0C
# GOOD: c.slli s8, 1
@@ -5678,7 +5678,7 @@
# GOOD64: c.slli s8, 63
0x7E 0x1C
-# GOOD: c.slli64 s9
+# GOOD: c.slli s9, 0
0x82 0x0C
# GOOD: c.slli s9, 1
@@ -5902,7 +5902,7 @@
# GOOD64: c.slli s9, 63
0xFE 0x1C
-# GOOD: c.slli64 s10
+# GOOD: c.slli s10, 0
0x02 0x0D
# GOOD: c.slli s10, 1
@@ -6126,7 +6126,7 @@
# GOOD64: c.slli s10, 63
0x7E 0x1D
-# GOOD: c.slli64 s11
+# GOOD: c.slli s11, 0
0x82 0x0D
# GOOD: c.slli s11, 1
@@ -6350,7 +6350,7 @@
# GOOD64: c.slli s11, 63
0xFE 0x1D
-# GOOD: c.slli64 t3
+# GOOD: c.slli t3, 0
0x02 0x0E
# GOOD: c.slli t3, 1
@@ -6574,7 +6574,7 @@
# GOOD64: c.slli t3, 63
0x7E 0x1E
-# GOOD: c.slli64 t4
+# GOOD: c.slli t4, 0
0x82 0x0E
# GOOD: c.slli t4, 1
@@ -6798,7 +6798,7 @@
# GOOD64: c.slli t4, 63
0xFE 0x1E
-# GOOD: c.slli64 t5
+# GOOD: c.slli t5, 0
0x02 0x0F
# GOOD: c.slli t5, 1
@@ -7022,7 +7022,7 @@
# GOOD64: c.slli t5, 63
0x7E 0x1F
-# GOOD: c.slli64 t6
+# GOOD: c.slli t6, 0
0x82 0x0F
# GOOD: c.slli t6, 1
diff --git a/llvm/test/MC/ELF/many-instructions.s b/llvm/test/MC/ELF/many-instructions.s
new file mode 100644
index 0000000..cbdb2a7
--- /dev/null
+++ b/llvm/test/MC/ELF/many-instructions.s
@@ -0,0 +1,10 @@
+# REQUIRES: asserts
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o /dev/null -debug-only=mc-dump 2>&1 | FileCheck %s
+
+## Test that encodeInstruction may cause a new fragment to be created.
+# CHECK: 0 Data Size:16200
+# CHECK: 16200 Data Size:180
+
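+## Each movabsq $foo, %rax encodes to 10 bytes, so the .rept below emits
+## 16384/10 = 1638 instructions (16380 bytes), which the CHECK lines above
+## expect to be split into 16200-byte and 180-byte data fragments.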
+.rept 16384/10
+movabsq $foo, %rax
+.endr
diff --git a/llvm/test/MC/ELF/section-sym2.s b/llvm/test/MC/ELF/section-sym2.s
index 167fc8c..fe2b904 100644
--- a/llvm/test/MC/ELF/section-sym2.s
+++ b/llvm/test/MC/ELF/section-sym2.s
@@ -1,27 +1,40 @@
# RUN: llvm-mc -filetype=obj -triple x86_64 %s -o %t
-# RUN: llvm-readelf -Srs %t | FileCheck %s
+# RUN: llvm-readelf -SrsX %t | FileCheck %s
## Test that we can forward reference a section.
mov .rodata, %rsi
+mov data, %rsi
mov .debug_info, %rsi
+mov .debug_abbrev, %rsi
.section .rodata,"a"
+.pushsection data, 2; .long 2; .popsection
+.section data; .long 1
.section .debug_info,"G",@progbits,11,comdat; .long x1
.section .debug_info,"G",@progbits,22,comdat; .long x2
.section .debug_info,"",@progbits; .long x0
+.text
+mov data, %rdi
+
+# CHECK: Relocation section '.rela.text'
+# CHECK: R_X86_64_32S {{.*}} data + 0
+# CHECK: R_X86_64_32S {{.*}} data + 0
+
# CHECK: Relocation section '.rela.debug_info' at offset {{.*}} contains 1
# CHECK: Relocation section '.rela.debug_info' at offset {{.*}} contains 1
# CHECK: Relocation section '.rela.debug_info' at offset {{.*}} contains 1
-# CHECK: Symbol table '.symtab' contains 8 entries:
-# CHECK-NEXT: Num: Value Size Type Bind Vis Ndx Name
+# CHECK: Symbol table '.symtab' contains 10 entries:
+# CHECK-NEXT: Num:
# CHECK-NEXT: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND
-# CHECK-NEXT: 0000000000000000 0 SECTION LOCAL DEFAULT 4 .rodata
-# CHECK-NEXT: 0000000000000000 0 SECTION LOCAL DEFAULT 11 .debug_info
-# CHECK-NEXT: 0000000000000000 0 NOTYPE LOCAL DEFAULT 5 11
-# CHECK-NEXT: 0000000000000000 0 NOTYPE LOCAL DEFAULT 8 22
+# CHECK-NEXT: 0000000000000000 0 SECTION LOCAL DEFAULT [[#]] (.rodata) .rodata
+# CHECK-NEXT: 0000000000000000 0 SECTION LOCAL DEFAULT [[#]] (data) data
+# CHECK-NEXT: 0000000000000000 0 SECTION LOCAL DEFAULT [[#]] (.debug_info) .debug_info
+# CHECK-NEXT: 0000000000000000 0 NOTYPE LOCAL DEFAULT [[#]] (.group) 11
+# CHECK-NEXT: 0000000000000000 0 NOTYPE LOCAL DEFAULT [[#]] (.group) 22
+# CHECK-NEXT: 0000000000000000 0 NOTYPE GLOBAL DEFAULT UND .debug_abbrev
# CHECK-NEXT: 0000000000000000 0 NOTYPE GLOBAL DEFAULT UND x1
# CHECK-NEXT: 0000000000000000 0 NOTYPE GLOBAL DEFAULT UND x2
# CHECK-NEXT: 0000000000000000 0 NOTYPE GLOBAL DEFAULT UND x0
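+
+## .debug_abbrev is referenced but never defined as a section, so it remains
+## an ordinary undefined symbol rather than being rejected; this supersedes
+## the undefined-debug.s error test deleted below.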
diff --git a/llvm/test/MC/ELF/undefined-debug.s b/llvm/test/MC/ELF/undefined-debug.s
deleted file mode 100644
index 95ead70..0000000
--- a/llvm/test/MC/ELF/undefined-debug.s
+++ /dev/null
@@ -1,5 +0,0 @@
-// RUN: not llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o %t 2>&1 | FileCheck %s
-// CHECK: error: Undefined section reference: .debug_pubnames
-
-.section .foo,"",@progbits
- .long .debug_pubnames
diff --git a/llvm/test/MC/RISCV/Relocations/mc-dump.s b/llvm/test/MC/RISCV/Relocations/mc-dump.s
index e8f4b14..ddc0c7d 100644
--- a/llvm/test/MC/RISCV/Relocations/mc-dump.s
+++ b/llvm/test/MC/RISCV/Relocations/mc-dump.s
@@ -9,10 +9,10 @@
# CHECK-NEXT:0 Data LinkerRelaxable Size:8 [97,00,00,00,e7,80,00,00]
# CHECK-NEXT: Fixup @0 Value:specifier(19,ext) Kind:4023
# CHECK-NEXT: Symbol @0 $x
-# CHECK-NEXT:8 Align Size:0+4 []
+# CHECK-NEXT:8 Align LinkerRelaxable Size:0+4 []
# CHECK-NEXT: Align:8 Fill:0 FillLen:1 MaxBytesToEmit:8 Nops
# CHECK-NEXT: Fixup @0 Value:4 Kind:[[#]]
-# CHECK-NEXT:12 Align Size:4+4 [13,05,30,00]
+# CHECK-NEXT:12 Align LinkerRelaxable Size:4+4 [13,05,30,00]
# CHECK-NEXT: Align:8 Fill:0 FillLen:1 MaxBytesToEmit:8 Nops
# CHECK-NEXT: Fixup @4 Value:4 Kind:[[#]]
# CHECK-NEXT:]
diff --git a/llvm/test/MC/RISCV/align.s b/llvm/test/MC/RISCV/align.s
index 1dab1c6..da3b1aa 100644
--- a/llvm/test/MC/RISCV/align.s
+++ b/llvm/test/MC/RISCV/align.s
@@ -154,3 +154,41 @@ data2:
2:
bnez t1, 1b
bnez t1, 2b
+
+## .text3a with a call at the start
+# NORELAX-RELOC: .rela.text3a
+# C-OR-ZCA-EXT-NORELAX-RELOC: .rela.text3a
+# RELAX-RELOC: .rela.text3a {
+# RELAX-RELOC-NEXT: 0x0 R_RISCV_CALL_PLT foo 0x0
+# RELAX-RELOC-NEXT: 0x0 R_RISCV_RELAX - 0x0
+# RELAX-RELOC-NEXT: 0xC R_RISCV_BRANCH .Ltmp[[#]] 0x0
+# RELAX-RELOC-NEXT: 0x10 R_RISCV_ALIGN - 0x4
+# RELAX-RELOC-NEXT: 0x14 R_RISCV_BRANCH .Ltmp[[#]] 0x0
+# RELAX-RELOC-NEXT: }
+.section .text3a, "ax"
+call foo
+bnez t1, 1f
+bnez t2, 2f
+1:
+.p2align 3
+2:
+bnez t1, 1b
+bnez t1, 2b
+
+## .text3b with a call at the end
+# RELAX-RELOC: .rela.text3b {
+# RELAX-RELOC-NEXT: 0x4 R_RISCV_BRANCH .Ltmp[[#]] 0x0
+# RELAX-RELOC-NEXT: 0x8 R_RISCV_ALIGN - 0x4
+# RELAX-RELOC-NEXT: 0xC R_RISCV_BRANCH .Ltmp[[#]] 0x0
+# RELAX-RELOC-NEXT: 0x14 R_RISCV_CALL_PLT foo 0x0
+# RELAX-RELOC-NEXT: 0x14 R_RISCV_RELAX - 0x0
+# RELAX-RELOC-NEXT: }
+.section .text3b, "ax"
+bnez t1, 1f
+bnez t2, 2f
+1:
+.p2align 3
+2:
+bnez t1, 1b
+bnez t1, 2b
+call foo
diff --git a/llvm/test/MC/RISCV/rv32c-invalid.s b/llvm/test/MC/RISCV/rv32c-invalid.s
index 413573a..5214747 100644
--- a/llvm/test/MC/RISCV/rv32c-invalid.s
+++ b/llvm/test/MC/RISCV/rv32c-invalid.s
@@ -26,8 +26,8 @@ c.jalr zero # CHECK: :[[@LINE]]:9: error: register must be a GPR excluding zero
c.mv ra, x0 # CHECK: :[[@LINE]]:11: error: register must be a GPR excluding zero (x0)
c.add ra, ra, x0 # CHECK: :[[@LINE]]:16: error: invalid operand for instruction
-## GPRNoX0X2
-c.lui x2, 4 # CHECK: :[[@LINE]]:7: error: register must be a GPR excluding zero (x0) and sp (x2){{$}}
+## GPRNoX2
+c.lui x2, 4 # CHECK: :[[@LINE]]:7: error: register must be a GPR excluding sp (x2){{$}}
## SP
c.addi4spn a0, a0, 12 # CHECK: :[[@LINE]]:17: error: register must be sp (x2)
@@ -35,10 +35,9 @@ c.addi16sp t0, 16 # CHECK: :[[@LINE]]:13: error: register must be sp (x2)
# Out of range immediates
-## uimmlog2xlennonzero
-c.slli t0, 64 # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [1, 31]
-c.srli a0, 32 # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [1, 31]
-c.srai a0, 0 # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [1, 31]
+## uimmlog2xlen
+c.slli t0, 64 # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [0, 31]
+c.srli a0, 32 # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [0, 31]
## simm6
c.li t0, 128 # CHECK: :[[@LINE]]:10: error: immediate must be an integer in the range [-32, 31]
@@ -49,13 +48,14 @@ c.andi a0, -33 # CHECK: :[[@LINE]]:12: error: immediate must be an integer in th
c.andi a0, foo # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [-32, 31]
c.andi a0, %lo(foo) # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [-32, 31]
c.andi a0, %hi(foo) # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [-32, 31]
+c.addi t0, -33 # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [-32, 31]
+c.addi t0, 32 # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [-32, 31]
+c.addi t0, foo # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [-32, 31]
+c.addi t0, %lo(foo) # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [-32, 31]
+c.addi t0, %hi(foo) # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [-32, 31]
## simm6nonzero
-c.addi t0, -33 # CHECK: :[[@LINE]]:12: error: immediate must be non-zero in the range [-32, 31]
-c.addi t0, 32 # CHECK: :[[@LINE]]:12: error: immediate must be non-zero in the range [-32, 31]
-c.addi t0, foo # CHECK: :[[@LINE]]:12: error: immediate must be non-zero in the range [-32, 31]
-c.addi t0, %lo(foo) # CHECK: :[[@LINE]]:12: error: immediate must be non-zero in the range [-32, 31]
-c.addi t0, %hi(foo) # CHECK: :[[@LINE]]:12: error: immediate must be non-zero in the range [-32, 31]
+c.nop 32 # CHECK: :[[@LINE]]:7: error: immediate must be non-zero in the range [-32, 31]
## c_lui_imm
c.lui t0, 0 # CHECK: :[[@LINE]]:11: error: immediate must be in [0xfffe0, 0xfffff] or [1, 31]
diff --git a/llvm/test/MC/RISCV/rv64c-invalid.s b/llvm/test/MC/RISCV/rv64c-invalid.s
index 9b0a324..404e02cf 100644
--- a/llvm/test/MC/RISCV/rv64c-invalid.s
+++ b/llvm/test/MC/RISCV/rv64c-invalid.s
@@ -13,10 +13,9 @@ c.ldsp zero, 4(sp) # CHECK: :[[@LINE]]:9: error: register must be a GPR excludi
# Out of range immediates
-## uimmlog2xlennonzero
-c.slli t0, 64 # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [1, 63]
-c.srli a0, -1 # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [1, 63]
-c.srai a0, 0 # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [1, 63]
+## uimmlog2xlen
+c.slli t0, 64 # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [0, 63]
+c.srli a0, -1 # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [0, 63]
## simm6
c.addiw t0, -33 # CHECK: :[[@LINE]]:13: error: immediate must be an integer in the range [-32, 31]
diff --git a/llvm/test/MC/RISCV/rvc-hints-invalid.s b/llvm/test/MC/RISCV/rvc-hints-invalid.s
index 2a7a6ad..540d2c6 100644
--- a/llvm/test/MC/RISCV/rvc-hints-invalid.s
+++ b/llvm/test/MC/RISCV/rvc-hints-invalid.s
@@ -1,11 +1,11 @@
# RUN: not llvm-mc -triple=riscv32 -mattr=+c < %s 2>&1 \
# RUN: | FileCheck -check-prefixes=CHECK,CHECK-RV32 %s
# RUN: not llvm-mc -triple=riscv64 -mattr=+c < %s 2>&1 \
-# RUN: | FileCheck -check-prefixes=CHECK,CHECK-RV64 %s
+# RUN: | FileCheck -check-prefixes=CHECK %s
c.nop 0 # CHECK: :[[@LINE]]:7: error: immediate must be non-zero in the range [-32, 31]
-c.addi x0, 33 # CHECK: :[[@LINE]]:12: error: immediate must be non-zero in the range [-32, 31]
+c.addi x0, 33 # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [-32, 31]
c.li x0, 42 # CHECK: :[[@LINE]]:10: error: immediate must be an integer in the range [-32, 31]
@@ -15,10 +15,7 @@ c.mv x0, x0 # CHECK: :[[@LINE]]:10: error: register must be a GPR excluding zero
c.add x0, x0 # CHECK: :[[@LINE]]:11: error: register must be a GPR excluding zero (x0)
-c.slli x0, 0 # CHECK-RV32: :[[@LINE]]:12: error: immediate must be an integer in the range [1, 31]
-c.slli x0, 32 # CHECK-RV32: :[[@LINE]]:12: error: immediate must be an integer in the range [1, 31]
-
-c.slli x0, 0 # CHECK-RV64: :[[@LINE]]:12: error: immediate must be an integer in the range [1, 63]
+c.slli x0, 32 # CHECK-RV32: :[[@LINE]]:12: error: immediate must be an integer in the range [0, 31]
c.srli64 x30 # CHECK: :[[@LINE]]:10: error: invalid operand for instruction
diff --git a/llvm/test/MC/RISCV/rvc-hints-valid.s b/llvm/test/MC/RISCV/rvc-hints-valid.s
index 5dc86d1..7fba864 100644
--- a/llvm/test/MC/RISCV/rvc-hints-valid.s
+++ b/llvm/test/MC/RISCV/rvc-hints-valid.s
@@ -48,18 +48,34 @@ c.add x0, a0
# CHECK-ASM: encoding: [0x06,0x00]
c.slli x0, 1
-# CHECK-ASM-AND-OBJ: c.slli64 zero
+# CHECK-ASM-AND-OBJ: c.slli zero, 0
# CHECK-ASM: encoding: [0x02,0x00]
c.slli64 x0
-# CHECK-ASM-AND-OBJ: c.slli64 a0
+# CHECK-ASM-AND-OBJ: c.slli zero, 0
+# CHECK-ASM: encoding: [0x02,0x00]
+c.slli x0, 0
+
+# CHECK-ASM-AND-OBJ: c.slli a0, 0
# CHECK-ASM: encoding: [0x02,0x05]
c.slli64 a0
-# CHECK-ASM-AND-OBJ: c.srli64 a1
+# CHECK-ASM-AND-OBJ: c.slli a0, 0
+# CHECK-ASM: encoding: [0x02,0x05]
+c.slli a0, 0
+
+# CHECK-ASM-AND-OBJ: c.srli a1, 0
# CHECK-ASM: encoding: [0x81,0x81]
c.srli64 a1
-# CHECK-ASM-AND-OBJ: c.srai64 a0
+# CHECK-ASM-AND-OBJ: c.srli a1, 0
+# CHECK-ASM: encoding: [0x81,0x81]
+c.srli a1, 0
+
+# CHECK-ASM-AND-OBJ: c.srai a0, 0
# CHECK-ASM: encoding: [0x01,0x85]
c.srai64 a0
+
+# CHECK-ASM-AND-OBJ: c.srai a0, 0
+# CHECK-ASM: encoding: [0x01,0x85]
+c.srai a0, 0
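+
+# The *64 mnemonics are aliases for a shift amount of 0; each pair above
+# assembles to the identical encoding.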
diff --git a/llvm/test/MC/RISCV/rve-invalid.s b/llvm/test/MC/RISCV/rve-invalid.s
index 0b1e896..92e0239a 100644
--- a/llvm/test/MC/RISCV/rve-invalid.s
+++ b/llvm/test/MC/RISCV/rve-invalid.s
@@ -115,7 +115,7 @@ auipc t6, 32
# CHECK: :[[@LINE+1]]:8: error: register must be a GPR excluding zero (x0)
c.addi x31, 0
# CHECK-DIS: 9846 <unknown>
-# CHECK: :[[@LINE+1]]:7: error: register must be a GPR excluding zero (x0)
+# CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
c.add x16, x17
# CHECK-DIS: 8046 <unknown>
# CHECK: :[[@LINE+1]]:10: error: register must be a GPR excluding zero (x0)
diff --git a/llvm/test/MC/WebAssembly/reference-types.s b/llvm/test/MC/WebAssembly/reference-types.s
index 08aafb2..7a838fc 100644
--- a/llvm/test/MC/WebAssembly/reference-types.s
+++ b/llvm/test/MC/WebAssembly/reference-types.s
@@ -1,5 +1,5 @@
-# RUN: llvm-mc -show-encoding -triple=wasm32-unknown-unknown -mattr=+reference-types < %s | FileCheck %s
-# RUN: llvm-mc -show-encoding -triple=wasm64-unknown-unknown -mattr=+reference-types < %s | FileCheck %s
+# RUN: llvm-mc -show-encoding -triple=wasm32-unknown-unknown -mattr=+reference-types -mattr=+gc < %s | FileCheck %s
+# RUN: llvm-mc -show-encoding -triple=wasm64-unknown-unknown -mattr=+reference-types -mattr=+gc < %s | FileCheck %s
# CHECK-LABEL:ref_is_null:
# CHECK: ref.is_null # encoding: [0xd1]
diff --git a/llvm/test/TableGen/CompressInstEmitter/suboperands.td b/llvm/test/TableGen/CompressInstEmitter/suboperands.td
index d83cc04..f4e43d5 100644
--- a/llvm/test/TableGen/CompressInstEmitter/suboperands.td
+++ b/llvm/test/TableGen/CompressInstEmitter/suboperands.td
@@ -115,7 +115,7 @@ def : CompressPat<(BigInst3 RegsC:$dst, RegsC:$src, simm6:$imm),
// CHECK-NEXT: ArchMCRegisterClasses[Arch::RegsCRegClassID].contains(MI.getOperand(0).getReg()) &&
// CHECK-NEXT: MI.getOperand(1).isReg() &&
// CHECK-NEXT: ArchMCRegisterClasses[Arch::RegsCRegClassID].contains(MI.getOperand(1).getReg()) &&
-// CHECK-NEXT: ArchValidateMCOperandForCompress(MI.getOperand(2), STI, 1)) {
+// CHECK-NEXT: ArchValidateMCOperandForCompress(MI.getOperand(2), STI, 1 /* simm6 */)) {
// CHECK-NEXT: // small $dst, $addr
// CHECK-NEXT: OutInst.setOpcode(Arch::SmallInst);
// CHECK-NEXT: // Operand: dst
@@ -131,7 +131,7 @@ def : CompressPat<(BigInst3 RegsC:$dst, RegsC:$src, simm6:$imm),
// CHECK-NEXT: ArchMCRegisterClasses[Arch::RegsCRegClassID].contains(MI.getOperand(0).getReg()) &&
// CHECK-NEXT: MI.getOperand(1).isReg() &&
// CHECK-NEXT: ArchMCRegisterClasses[Arch::RegsCRegClassID].contains(MI.getOperand(1).getReg()) &&
-// CHECK-NEXT: ArchValidateMCOperandForCompress(MI.getOperand(2), STI, 1)) {
+// CHECK-NEXT: ArchValidateMCOperandForCompress(MI.getOperand(2), STI, 1 /* simm6 */)) {
// CHECK-NEXT: // small $dst, $src, $imm
// CHECK-NEXT: OutInst.setOpcode(Arch::SmallInst2);
// CHECK-NEXT: // Operand: dst
@@ -148,7 +148,7 @@ def : CompressPat<(BigInst3 RegsC:$dst, RegsC:$src, simm6:$imm),
// CHECK-NEXT: ArchMCRegisterClasses[Arch::RegsCRegClassID].contains(MI.getOperand(0).getReg()) &&
// CHECK-NEXT: MI.getOperand(1).isReg() &&
// CHECK-NEXT: ArchMCRegisterClasses[Arch::RegsCRegClassID].contains(MI.getOperand(1).getReg()) &&
-// CHECK-NEXT: ArchValidateMCOperandForCompress(MI.getOperand(2), STI, 1)) {
+// CHECK-NEXT: ArchValidateMCOperandForCompress(MI.getOperand(2), STI, 1 /* simm6 */)) {
// CHECK-NEXT: // small $dst, $addr
// CHECK-NEXT: OutInst.setOpcode(Arch::SmallInst3);
// CHECK-NEXT: // Operand: dst
@@ -161,16 +161,17 @@ def : CompressPat<(BigInst3 RegsC:$dst, RegsC:$src, simm6:$imm),
// CHECK-NEXT: } // if
// CHECK-LABEL: ArchValidateMCOperandForUncompress
-// CHECK: // simm12
-// CHECK: return isInt<12>(Imm);
+// CHECK: // simm6
+// CHECK: return isInt<6>(Imm);
// CHECK-LABEL: uncompressInst
// CHECK: case Arch::SmallInst:
// CHECK-NEXT: if (MI.getOperand(0).isReg() &&
-// CHECK-NEXT: ArchMCRegisterClasses[Arch::RegsRegClassID].contains(MI.getOperand(0).getReg()) &&
+// CHECK-NEXT: ArchMCRegisterClasses[Arch::RegsCRegClassID].contains(MI.getOperand(0).getReg()) &&
// CHECK-NEXT: MI.getOperand(1).isReg() &&
-// CHECK-NEXT: ArchMCRegisterClasses[Arch::RegsRegClassID].contains(MI.getOperand(1).getReg()) &&
-// CHECK-NEXT: ArchValidateMCOperandForUncompress(MI.getOperand(2), STI, 1)) {
+// CHECK-NEXT: ArchMCRegisterClasses[Arch::RegsCRegClassID].contains(MI.getOperand(1).getReg()) &&
+// CHECK-NEXT: ArchValidateMCOperandForUncompress(MI.getOperand(2), STI, 1 /* simm6 */) &&
+// CHECK-NEXT: ArchValidateMCOperandForUncompress(MI.getOperand(2), STI, 2 /* simm12 */)) {
// CHECK-NEXT: // big $dst, $addr
// CHECK-NEXT: OutInst.setOpcode(Arch::BigInst);
// CHECK-NEXT: // Operand: dst
@@ -183,10 +184,11 @@ def : CompressPat<(BigInst3 RegsC:$dst, RegsC:$src, simm6:$imm),
// CHECK-NEXT: } // if
// CHECK: case Arch::SmallInst2:
// CHECK-NEXT: if (MI.getOperand(0).isReg() &&
-// CHECK-NEXT: ArchMCRegisterClasses[Arch::RegsRegClassID].contains(MI.getOperand(0).getReg()) &&
+// CHECK-NEXT: ArchMCRegisterClasses[Arch::RegsCRegClassID].contains(MI.getOperand(0).getReg()) &&
// CHECK-NEXT: MI.getOperand(1).isReg() &&
-// CHECK-NEXT: ArchMCRegisterClasses[Arch::RegsRegClassID].contains(MI.getOperand(1).getReg()) &&
-// CHECK-NEXT: ArchValidateMCOperandForUncompress(MI.getOperand(2), STI, 1)) {
+// CHECK-NEXT: ArchMCRegisterClasses[Arch::RegsCRegClassID].contains(MI.getOperand(1).getReg()) &&
+// CHECK-NEXT: ArchValidateMCOperandForUncompress(MI.getOperand(2), STI, 1 /* simm6 */) &&
+// CHECK-NEXT: ArchValidateMCOperandForUncompress(MI.getOperand(2), STI, 2 /* simm12 */)) {
// CHECK-NEXT: // big $dst, $addr
// CHECK-NEXT: OutInst.setOpcode(Arch::BigInst2);
// CHECK-NEXT: // Operand: dst
@@ -199,10 +201,11 @@ def : CompressPat<(BigInst3 RegsC:$dst, RegsC:$src, simm6:$imm),
// CHECK-NEXT: } // if
// CHECK: case Arch::SmallInst3:
// CHECK-NEXT: if (MI.getOperand(0).isReg() &&
-// CHECK-NEXT: ArchMCRegisterClasses[Arch::RegsRegClassID].contains(MI.getOperand(0).getReg()) &&
+// CHECK-NEXT: ArchMCRegisterClasses[Arch::RegsCRegClassID].contains(MI.getOperand(0).getReg()) &&
// CHECK-NEXT: MI.getOperand(1).isReg() &&
-// CHECK-NEXT: ArchMCRegisterClasses[Arch::RegsRegClassID].contains(MI.getOperand(1).getReg()) &&
-// CHECK-NEXT: ArchValidateMCOperandForUncompress(MI.getOperand(2), STI, 1)) {
+// CHECK-NEXT: ArchMCRegisterClasses[Arch::RegsCRegClassID].contains(MI.getOperand(1).getReg()) &&
+// CHECK-NEXT: ArchValidateMCOperandForUncompress(MI.getOperand(2), STI, 1 /* simm6 */) &&
+// CHECK-NEXT: ArchValidateMCOperandForUncompress(MI.getOperand(2), STI, 2 /* simm12 */)) {
// CHECK-NEXT: // big $dst, $src, $imm
// CHECK-NEXT: OutInst.setOpcode(Arch::BigInst3);
// CHECK-NEXT: // Operand: dst
@@ -226,7 +229,7 @@ def : CompressPat<(BigInst3 RegsC:$dst, RegsC:$src, simm6:$imm),
// CHECK-NEXT: MI.getOperand(1).isReg() && MI.getOperand(1).getReg().isPhysical() &&
// CHECK-NEXT: ArchMCRegisterClasses[Arch::RegsCRegClassID].contains(MI.getOperand(1).getReg()) &&
// CHECK-NEXT: MI.getOperand(2).isImm() &&
-// CHECK-NEXT: ArchValidateMachineOperand(MI.getOperand(2), &STI, 1)) {
+// CHECK-NEXT: ArchValidateMachineOperand(MI.getOperand(2), &STI, 1 /* simm6 */)) {
// CHECK-NEXT: // small $dst, $addr
// CHECK-NEXT: // Operand: dst
// CHECK-NEXT: // Operand: addr
@@ -238,7 +241,7 @@ def : CompressPat<(BigInst3 RegsC:$dst, RegsC:$src, simm6:$imm),
// CHECK-NEXT: MI.getOperand(1).isReg() && MI.getOperand(1).getReg().isPhysical() &&
// CHECK-NEXT: ArchMCRegisterClasses[Arch::RegsCRegClassID].contains(MI.getOperand(1).getReg()) &&
// CHECK-NEXT: MI.getOperand(2).isImm() &&
-// CHECK-NEXT: ArchValidateMachineOperand(MI.getOperand(2), &STI, 1)) {
+// CHECK-NEXT: ArchValidateMachineOperand(MI.getOperand(2), &STI, 1 /* simm6 */)) {
// CHECK-NEXT: // small $dst, $src, $imm
// CHECK-NEXT: // Operand: dst
// CHECK-NEXT: // Operand: src
@@ -251,7 +254,7 @@ def : CompressPat<(BigInst3 RegsC:$dst, RegsC:$src, simm6:$imm),
// CHECK-NEXT: MI.getOperand(1).isReg() && MI.getOperand(1).getReg().isPhysical() &&
// CHECK-NEXT: ArchMCRegisterClasses[Arch::RegsCRegClassID].contains(MI.getOperand(1).getReg()) &&
// CHECK-NEXT: MI.getOperand(2).isImm() &&
-// CHECK-NEXT: ArchValidateMachineOperand(MI.getOperand(2), &STI, 1)) {
+// CHECK-NEXT: ArchValidateMachineOperand(MI.getOperand(2), &STI, 1 /* simm6 */)) {
// CHECK-NEXT: // small $dst, $addr
// CHECK-NEXT: // Operand: dst
// CHECK-NEXT: // Operand: addr
diff --git a/llvm/test/TableGen/RuntimeLibcallEmitter.td b/llvm/test/TableGen/RuntimeLibcallEmitter.td
index 579e3c7..783a861 100644
--- a/llvm/test/TableGen/RuntimeLibcallEmitter.td
+++ b/llvm/test/TableGen/RuntimeLibcallEmitter.td
@@ -95,8 +95,8 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
// CHECK-NEXT: __lshrdi3 = 4, // __lshrdi3
// CHECK-NEXT: bzero = 5, // bzero
// CHECK-NEXT: calloc = 6, // calloc
-// CHECK-NEXT: sqrtl_f80 = 7, // sqrtl
-// CHECK-NEXT: sqrtl_f128 = 8, // sqrtl
+// CHECK-NEXT: sqrtl_f128 = 7, // sqrtl
+// CHECK-NEXT: sqrtl_f80 = 8, // sqrtl
// CHECK-NEXT: NumLibcallImpls = 9
// CHECK-NEXT: };
// CHECK-NEXT: } // End namespace RTLIB
@@ -157,8 +157,8 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
// CHECK-NEXT: RTLIB::SRL_I64, // RTLIB::__lshrdi3
// CHECK-NEXT: RTLIB::BZERO, // RTLIB::bzero
// CHECK-NEXT: RTLIB::CALLOC, // RTLIB::calloc
-// CHECK-NEXT: RTLIB::SQRT_F80, // RTLIB::sqrtl_f80
// CHECK-NEXT: RTLIB::SQRT_F128, // RTLIB::sqrtl_f128
+// CHECK-NEXT: RTLIB::SQRT_F80, // RTLIB::sqrtl_f80
// CHECK-NEXT: };
diff --git a/llvm/test/TableGen/getsetop.td b/llvm/test/TableGen/getsetop.td
index aac644f..031606f 100644
--- a/llvm/test/TableGen/getsetop.td
+++ b/llvm/test/TableGen/getsetop.td
@@ -28,6 +28,7 @@ def bob : Super;
def test {
dag orig = (foo 1, 2:$a, $b);
dag another = (qux "hello", $world);
+ dag named = (foo:$root 1, 2:$a, $b);
// CHECK: dag replaceWithBar = (bar 1, 2:$a, ?:$b);
dag replaceWithBar = !setop(orig, bar);
@@ -41,6 +42,19 @@ def test {
// CHECK: dag getopToSetop = (foo "hello", ?:$world);
dag getopToSetop = !setdagop(another, !getdagop(orig));
+ // CHECK: dag setOpName = (foo:$baz 1, 2:$a, ?:$b);
+ dag setOpName = !setdagopname(orig, "baz");
+
+ // CHECK: dag getopNameToSetOpName = (foo:$root 1, 2:$a, ?:$b);
+ dag getopNameToSetOpName = !setdagopname(orig, !getdagopname(named));
+
+ // CHECK: dag setOpNameExpl = (foo:$baz 1, 2:$a, ?:$b);
+ dag setOpNameExpl = !setdagopname((foo 1, 2:$a, $b), "baz");
+
+ // CHECK: dag getopNameToSetOpNameExpl = (foo:$root 1, 2:$a, ?:$b);
+ dag getopNameToSetOpNameExpl =
+ !setdagopname(orig, !getdagopname((foo:$root 1, 2:$a, $b)));
+
// CHECK: dag getopToBangDag = (foo 1:$a, 2:$b, 3:$c);
dag getopToBangDag = !dag(!getdagop(orig), [1, 2, 3], ["a", "b", "c"]);
diff --git a/llvm/test/TableGen/unsetop.td b/llvm/test/TableGen/unsetop.td
index 7a4f98a..54ede19 100644
--- a/llvm/test/TableGen/unsetop.td
+++ b/llvm/test/TableGen/unsetop.td
@@ -16,6 +16,12 @@ def test {
dag undefSecond = !con((op 1), (? 2));
// CHECK: dag undefBoth = (? 1, 2);
dag undefBoth = !con((? 1), (? 2));
+ // CHECK: dag namedLHS = (op:$lhs 1, 2);
+ dag namedLHS = !con((op:$lhs 1), (op 2));
+ // CHECK: dag namedRHS = (op:$rhs 1, 2);
+ dag namedRHS = !con((op 1), (op:$rhs 2));
+ // CHECK: dag namedBoth = (op:$lhs 1, 2);
+ dag namedBoth = !con((op:$lhs 1), (op:$rhs 2));
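+ // When both sides of !con carry an operator name, the LHS name is kept.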
#ifdef ERROR
// ERROR: Concatenated Dag operators do not match: '(op 1)' vs. '(otherop 2)'
diff --git a/llvm/test/ThinLTO/X86/memprof-basic.ll b/llvm/test/ThinLTO/X86/memprof-basic.ll
index c5eedb2..757f6e9 100644
--- a/llvm/test/ThinLTO/X86/memprof-basic.ll
+++ b/llvm/test/ThinLTO/X86/memprof-basic.ll
@@ -52,6 +52,7 @@
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
;; We should have cloned bar, baz, and foo, for the cold memory allocation.
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+; RUN: cat %t.ccg.clonefuncassign.dot | FileCheck %s --check-prefix=DOTFUNCASSIGN
; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR
@@ -370,6 +371,10 @@ attributes #0 = { noinline optnone }
; DOTCLONED: Node[[BAR2]] [shape=record,tooltip="N[[BAR2]] ContextIds: 2",fillcolor="cyan",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"];
; DOTCLONED: }
+;; Here we are just ensuring that the post-function-assignment dot graph
+;; includes clone information for both the caller and callee in the node labels.
+; DOTFUNCASSIGN: _Z3bazv.memprof.1 -\> _Z3barv.memprof.1
+; DOTFUNCASSIGN: _Z3barv.memprof.1 -\> alloc
; DISTRIB: ^[[BAZ:[0-9]+]] = gv: (guid: 1807954217441101578, {{.*}} callsites: ((callee: ^[[BAR:[0-9]+]], clones: (0, 1)
; DISTRIB: ^[[FOO:[0-9]+]] = gv: (guid: 8107868197919466657, {{.*}} callsites: ((callee: ^[[BAZ]], clones: (0, 1)
diff --git a/llvm/test/ThinLTO/X86/memprof_func_assign_fix.ll b/llvm/test/ThinLTO/X86/memprof_func_assign_fix.ll
new file mode 100644
index 0000000..8303d6d
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof_func_assign_fix.ll
@@ -0,0 +1,145 @@
+;; Make sure we assign the original callsite to a function clone (which will be
+;; the original function clone), even when we cannot update its caller (due to
+;; missing metadata, e.g. from mismatched profiles). Otherwise we will try to use
+;; the original function for a different clone, leading to confusion later when
+;; rewriting the calls.
+
+;; -stats requires asserts
+; REQUIRES: asserts
+
+; RUN: opt -thinlto-bc %s >%t.o
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN: -supports-hot-cold-new \
+; RUN: -r=%t.o,A,plx \
+; RUN: -r=%t.o,B,plx \
+; RUN: -r=%t.o,C,plx \
+; RUN: -r=%t.o,D,plx \
+; RUN: -r=%t.o,E,plx \
+; RUN: -r=%t.o,F,plx \
+; RUN: -r=%t.o,G,plx \
+; RUN: -r=%t.o,A1,plx \
+; RUN: -r=%t.o,B1,plx \
+; RUN: -r=%t.o,_Znwm, \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes -debug-only=memprof-context-disambiguation \
+; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN: -o %t.out 2>&1 | FileCheck %s \
+; RUN: --implicit-check-not="Mismatch in call clone assignment" \
+; RUN: --implicit-check-not="Number of callsites assigned to call multiple non-matching clones"
+
+; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR
+
+; ModuleID = '<stdin>'
+source_filename = "reduced.ll"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; IR-LABEL: define dso_local void @A()
+define void @A() #0 {
+ ; IR: call void @C()
+ call void @C()
+ ret void
+}
+
+; IR-LABEL: define dso_local void @B()
+define void @B() #0 {
+ ; IR: call void @C.memprof.1()
+ call void @C(), !callsite !1
+ ret void
+}
+
+; IR-LABEL: define dso_local void @C()
+define void @C() #0 {
+ ; IR: call void @F()
+ call void @F(), !callsite !16
+ ; IR: call void @D()
+ call void @D(), !callsite !2
+ ret void
+}
+
+; IR-LABEL: define dso_local void @D()
+define void @D() #0 {
+ ; IR: call void @E()
+ call void @E(), !callsite !3
+ ; IR: call void @G()
+ call void @G(), !callsite !17
+ ret void
+}
+
+; IR-LABEL: define dso_local void @E()
+define void @E() #0 {
+ ; IR: call ptr @_Znwm(i64 0) #[[NOTCOLD:[0-9]+]]
+ %1 = call ptr @_Znwm(i64 0), !memprof !4, !callsite !9
+ ret void
+}
+
+; IR-LABEL: define dso_local void @F()
+define void @F() #0 {
+ ; IR: call void @G()
+ call void @G(), !callsite !17
+ ret void
+}
+
+; IR-LABEL: define dso_local void @G()
+define void @G() #0 {
+ ; IR: call ptr @_Znwm(i64 0) #[[NOTCOLD]]
+ %2 = call ptr @_Znwm(i64 0), !memprof !10, !callsite !15
+ ret void
+}
+
+; IR-LABEL: define dso_local void @A1()
+define void @A1() #0 {
+ ; IR: call void @C()
+ call void @C(), !callsite !18
+ ret void
+}
+
+; IR-LABEL: define dso_local void @B1()
+define void @B1() #0 {
+ ; IR: call void @C.memprof.1()
+ call void @C(), !callsite !19
+ ret void
+}
+
+; IR-LABEL: define dso_local void @C.memprof.1()
+ ; IR: call void @F.memprof.1()
+ ; IR: call void @D.memprof.1()
+
+; IR-LABEL: define dso_local void @D.memprof.1()
+ ; IR: call void @E.memprof.1()
+ ; IR: call void @G()
+
+; IR-LABEL: define dso_local void @E.memprof.1()
+ ; IR: call ptr @_Znwm(i64 0) #[[COLD:[0-9]+]]
+
+; IR-LABEL: define dso_local void @F.memprof.1()
+ ; IR: call void @G.memprof.1()
+
+; IR-LABEL: define dso_local void @G.memprof.1()
+ ; IR: call ptr @_Znwm(i64 0) #[[COLD]]
+
+declare ptr @_Znwm(i64)
+
+attributes #0 = { noinline optnone }
+; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { "memprof"="cold" }
+
+!0 = !{i64 123}
+!1 = !{i64 234}
+!2 = !{i64 345}
+!3 = !{i64 456}
+!4 = !{!5, !7}
+!5 = !{!6, !"notcold"}
+!6 = !{i64 567, i64 456, i64 345, i64 123}
+!7 = !{!8, !"cold"}
+!8 = !{i64 567, i64 456, i64 345, i64 234}
+!9 = !{i64 567}
+!10 = !{!11, !13}
+!11 = !{!12, !"notcold"}
+!12 = !{i64 678, i64 891, i64 789, i64 912}
+!13 = !{!14, !"cold"}
+!14 = !{i64 678, i64 891, i64 789, i64 812}
+!15 = !{i64 678}
+!16 = !{i64 789}
+!17 = !{i64 891}
+!18 = !{i64 912}
+!19 = !{i64 812}
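+
+;; Note that !0 (frame 123) appears at the root of the notcold context !6 but
+;; is never attached to the call in @A, modeling the missing-metadata case
+;; described above.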
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll b/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
index 7abc32e..f53127f 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
@@ -1065,3 +1065,37 @@ for.body:
%exitcond.not = icmp eq i32 %inc, %N
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
+
+define i64 @pr150611_add_offset_is_not_loop_invariant(i1 %cond) {
+; CHECK-LABEL: define i64 @pr150611_add_offset_is_not_loop_invariant(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[REMAMT:%.*]] = select i1 [[COND]], i64 2, i64 0
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[INDVARS:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[ADD_OFFSET:%.*]] = zext i1 [[COND]] to i64
+; CHECK-NEXT: [[ADD:%.*]] = add nuw i64 [[INDVARS]], [[ADD_OFFSET]]
+; CHECK-NEXT: [[REM:%.*]] = urem i64 [[ADD]], [[REMAMT]]
+; CHECK-NEXT: [[INDVARS_NEXT]] = add nuw i64 [[INDVARS]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_NEXT]], 3
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_EXIT:.*]], label %[[FOR_BODY]]
+; CHECK: [[FOR_EXIT]]:
+; CHECK-NEXT: ret i64 [[REM]]
+;
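+; The add operand %add.offset is computed inside the loop body, so it is not
+; loop-invariant and the urem fold must not fire; the CHECK lines above expect
+; the loop to be left unchanged.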
+entry:
+ %remamt = select i1 %cond, i64 2, i64 0
+ br label %for.body
+
+for.body:
+ %indvars = phi i64 [ 0, %entry ], [ %indvars.next, %for.body ]
+ %add.offset = zext i1 %cond to i64
+ %add = add nuw i64 %indvars, %add.offset
+ %rem = urem i64 %add, %remamt
+ %indvars.next = add nuw i64 %indvars, 1
+ %exitcond = icmp eq i64 %indvars.next, 3
+ br i1 %exitcond, label %for.exit, label %for.body
+
+for.exit:
+ ret i64 %rem
+}
diff --git a/llvm/test/Transforms/FunctionAttrs/noalias.ll b/llvm/test/Transforms/FunctionAttrs/noalias.ll
new file mode 100644
index 0000000..de8bd9e
--- /dev/null
+++ b/llvm/test/Transforms/FunctionAttrs/noalias.ll
@@ -0,0 +1,245 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=function-attrs < %s | FileCheck %s
+
+declare noalias ptr @malloc(i64 %size)
+declare ptr @not_malloc(i64 %size)
+declare void @capture(ptr)
+
+@g = external global i8
+
+define ptr @return_malloc(i64 %size) {
+; CHECK-LABEL: define noalias ptr @return_malloc(
+; CHECK-SAME: i64 [[SIZE:%.*]]) {
+; CHECK-NEXT: [[A:%.*]] = call ptr @malloc(i64 [[SIZE]])
+; CHECK-NEXT: ret ptr [[A]]
+;
+ %a = call ptr @malloc(i64 %size)
+ ret ptr %a
+}
+
+define ptr @return_not_malloc(i64 %size) {
+; CHECK-LABEL: define ptr @return_not_malloc(
+; CHECK-SAME: i64 [[SIZE:%.*]]) {
+; CHECK-NEXT: [[A:%.*]] = call ptr @not_malloc(i64 [[SIZE]])
+; CHECK-NEXT: ret ptr [[A]]
+;
+ %a = call ptr @not_malloc(i64 %size)
+ ret ptr %a
+}
+
+define ptr @return_null() {
+; CHECK-LABEL: define noalias noundef ptr @return_null(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: ret ptr null
+;
+ ret ptr null
+}
+
+define ptr @return_poison() {
+; CHECK-LABEL: define noalias ptr @return_poison(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: ret ptr poison
+;
+ ret ptr poison
+}
+
+define ptr @return_alloca() {
+; CHECK-LABEL: define noalias noundef nonnull ptr @return_alloca(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[A:%.*]] = alloca i8, align 1
+; CHECK-NEXT: ret ptr [[A]]
+;
+ %a = alloca i8
+ ret ptr %a
+}
+
+; noalias arg does not imply noalias return
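+; (a noalias argument is only noalias against other pointers visible inside
+; the callee; the caller may still hold aliases of it)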
+define ptr @return_noalias_arg(ptr noalias %arg) {
+; CHECK-LABEL: define ptr @return_noalias_arg(
+; CHECK-SAME: ptr noalias readnone returned captures(ret: address, provenance) [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: ret ptr [[ARG]]
+;
+ ret ptr %arg
+}
+
+define ptr @return_global() {
+; CHECK-LABEL: define noundef nonnull ptr @return_global(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: ret ptr @g
+;
+ ret ptr @g
+}
+
+define ptr @no_return() {
+; CHECK-LABEL: define noalias noundef nonnull ptr @no_return(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: unreachable
+;
+ unreachable
+}
+
+define ptr @return_multiple(i1 %c, i64 %size) {
+; CHECK-LABEL: define noalias ptr @return_multiple(
+; CHECK-SAME: i1 [[C:%.*]], i64 [[SIZE:%.*]]) {
+; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[ELSE:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: [[A:%.*]] = call ptr @malloc(i64 [[SIZE]])
+; CHECK-NEXT: ret ptr [[A]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: [[B:%.*]] = call ptr @malloc(i64 [[SIZE]])
+; CHECK-NEXT: ret ptr [[B]]
+;
+  br i1 %c, label %if, label %else
+
+if:
+ %a = call ptr @malloc(i64 %size)
+ ret ptr %a
+
+else:
+ %b = call ptr @malloc(i64 %size)
+ ret ptr %b
+}
+
+define ptr @return_select(i1 %c, i64 %size) {
+; CHECK-LABEL: define noalias ptr @return_select(
+; CHECK-SAME: i1 [[C:%.*]], i64 [[SIZE:%.*]]) {
+; CHECK-NEXT: [[A:%.*]] = call ptr @malloc(i64 [[SIZE]])
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C]], ptr [[A]], ptr null
+; CHECK-NEXT: ret ptr [[SEL]]
+;
+ %a = call ptr @malloc(i64 %size)
+ %sel = select i1 %c, ptr %a, ptr null
+ ret ptr %sel
+}
+
+define ptr @return_phi(i1 %c, i64 %size) {
+; CHECK-LABEL: define noalias ptr @return_phi(
+; CHECK-SAME: i1 [[C:%.*]], i64 [[SIZE:%.*]]) {
+; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[ELSE:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: [[A:%.*]] = call ptr @malloc(i64 [[SIZE]])
+; CHECK-NEXT: br label %[[JOIN:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: br label %[[JOIN]]
+; CHECK: [[JOIN]]:
+; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ [[A]], %[[IF]] ], [ null, %[[ELSE]] ]
+; CHECK-NEXT: ret ptr [[PHI]]
+;
+ br i1 %c, label %if, label %else
+
+if:
+ %a = call ptr @malloc(i64 %size)
+ br label %join
+
+else:
+ br label %join
+
+join:
+ %phi = phi ptr [ %a, %if ], [ null, %else ]
+ ret ptr %phi
+}
+
+define ptr @return_phi_wrong(i1 %c, i64 %size) {
+; CHECK-LABEL: define ptr @return_phi_wrong(
+; CHECK-SAME: i1 [[C:%.*]], i64 [[SIZE:%.*]]) {
+; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[ELSE:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: [[A:%.*]] = call ptr @malloc(i64 [[SIZE]])
+; CHECK-NEXT: br label %[[JOIN:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: [[B:%.*]] = call ptr @not_malloc(i64 [[SIZE]])
+; CHECK-NEXT: br label %[[JOIN]]
+; CHECK: [[JOIN]]:
+; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ [[A]], %[[IF]] ], [ [[B]], %[[ELSE]] ]
+; CHECK-NEXT: ret ptr [[PHI]]
+;
+ br i1 %c, label %if, label %else
+
+if:
+ %a = call ptr @malloc(i64 %size)
+ br label %join
+
+else:
+ %b = call ptr @not_malloc(i64 %size)
+ br label %join
+
+join:
+ %phi = phi ptr [ %a, %if ], [ %b, %else ]
+ ret ptr %phi
+}
+
+define ptr @return_malloc_with_store(i64 %size) {
+; CHECK-LABEL: define noalias noundef ptr @return_malloc_with_store(
+; CHECK-SAME: i64 [[SIZE:%.*]]) {
+; CHECK-NEXT: [[A:%.*]] = call ptr @malloc(i64 [[SIZE]])
+; CHECK-NEXT: store i8 0, ptr [[A]], align 1
+; CHECK-NEXT: ret ptr [[A]]
+;
+ %a = call ptr @malloc(i64 %size)
+ store i8 0, ptr %a
+ ret ptr %a
+}
+
+define ptr @return_malloc_captured(i64 %size) {
+; CHECK-LABEL: define ptr @return_malloc_captured(
+; CHECK-SAME: i64 [[SIZE:%.*]]) {
+; CHECK-NEXT: [[A:%.*]] = call ptr @malloc(i64 [[SIZE]])
+; CHECK-NEXT: call void @capture(ptr [[A]])
+; CHECK-NEXT: ret ptr [[A]]
+;
+ %a = call ptr @malloc(i64 %size)
+ call void @capture(ptr %a)
+ ret ptr %a
+}
+
+define ptr @scc1(i1 %c) {
+; CHECK-LABEL: define noalias ptr @scc1(
+; CHECK-SAME: i1 [[C:%.*]]) {
+; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[ELSE:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: [[A:%.*]] = call ptr @malloc(i64 4)
+; CHECK-NEXT: ret ptr [[A]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: [[B:%.*]] = call ptr @scc2(i1 [[C]])
+; CHECK-NEXT: ret ptr [[B]]
+;
+ br i1 %c, label %if, label %else
+
+if:
+ %a = call ptr @malloc(i64 4)
+ ret ptr %a
+
+else:
+ %b = call ptr @scc2(i1 %c)
+ ret ptr %b
+}
+
+define ptr @scc2(i1 %c) {
+; CHECK-LABEL: define noalias ptr @scc2(
+; CHECK-SAME: i1 [[C:%.*]]) {
+; CHECK-NEXT: [[A:%.*]] = call ptr @scc1(i1 [[C]])
+; CHECK-NEXT: ret ptr [[A]]
+;
+ %a = call ptr @scc1(i1 %c)
+ ret ptr %a
+}
+
+define ptr @return_unknown_call(ptr %fn) {
+; CHECK-LABEL: define ptr @return_unknown_call(
+; CHECK-SAME: ptr readonly captures(none) [[FN:%.*]]) {
+; CHECK-NEXT: [[A:%.*]] = call ptr [[FN]]()
+; CHECK-NEXT: ret ptr [[A]]
+;
+ %a = call ptr %fn()
+ ret ptr %a
+}
+
+define ptr @return_unknown_noalias_call(ptr %fn) {
+; CHECK-LABEL: define noalias ptr @return_unknown_noalias_call(
+; CHECK-SAME: ptr readonly captures(none) [[FN:%.*]]) {
+; CHECK-NEXT: [[A:%.*]] = call noalias ptr [[FN]]()
+; CHECK-NEXT: ret ptr [[A]]
+;
+ %a = call noalias ptr %fn()
+ ret ptr %a
+}
diff --git a/llvm/test/Transforms/FunctionAttrs/nofree.ll b/llvm/test/Transforms/FunctionAttrs/nofree.ll
index 1671189..89f030d 100644
--- a/llvm/test/Transforms/FunctionAttrs/nofree.ll
+++ b/llvm/test/Transforms/FunctionAttrs/nofree.ll
@@ -156,6 +156,24 @@ entry:
ret void
}
+define void @unknown_call(ptr %fn) {
+; CHECK-LABEL: @unknown_call(
+; CHECK-NEXT: call void [[FN:%.*]]()
+; CHECK-NEXT: ret void
+;
+ call void %fn()
+ ret void
+}
+
+define void @unknown_nofree_call(ptr %fn) {
+; CHECK-LABEL: @unknown_nofree_call(
+; CHECK-NEXT: call void [[FN:%.*]]() #[[ATTR5]]
+; CHECK-NEXT: ret void
+;
+ call void %fn() nofree
+ ret void
+}
+
declare void @_ZdaPv(ptr) local_unnamed_addr #4
attributes #0 = { uwtable }
diff --git a/llvm/test/Transforms/FunctionAttrs/nonnull.ll b/llvm/test/Transforms/FunctionAttrs/nonnull.ll
index 483b560..8df242f 100644
--- a/llvm/test/Transforms/FunctionAttrs/nonnull.ll
+++ b/llvm/test/Transforms/FunctionAttrs/nonnull.ll
@@ -1396,5 +1396,35 @@ define ptr @pr91177_non_inbounds_gep(ptr nonnull %arg) {
ret ptr %res
}
+define ptr @unknown_func(ptr %fn) {
+; FNATTRS-LABEL: define ptr @unknown_func(
+; FNATTRS-SAME: ptr readonly captures(none) [[FN:%.*]]) {
+; FNATTRS-NEXT: [[RES:%.*]] = call ptr [[FN]]()
+; FNATTRS-NEXT: ret ptr [[RES]]
+;
+; ATTRIBUTOR-LABEL: define ptr @unknown_func(
+; ATTRIBUTOR-SAME: ptr nofree nonnull captures(none) [[FN:%.*]]) {
+; ATTRIBUTOR-NEXT: [[RES:%.*]] = call ptr [[FN]]()
+; ATTRIBUTOR-NEXT: ret ptr [[RES]]
+;
+ %res = call ptr %fn()
+ ret ptr %res
+}
+
+define ptr @unknown_nonnull_func(ptr %fn) {
+; FNATTRS-LABEL: define nonnull ptr @unknown_nonnull_func(
+; FNATTRS-SAME: ptr readonly captures(none) [[FN:%.*]]) {
+; FNATTRS-NEXT: [[RES:%.*]] = call nonnull ptr [[FN]]()
+; FNATTRS-NEXT: ret ptr [[RES]]
+;
+; ATTRIBUTOR-LABEL: define nonnull ptr @unknown_nonnull_func(
+; ATTRIBUTOR-SAME: ptr nofree nonnull captures(none) [[FN:%.*]]) {
+; ATTRIBUTOR-NEXT: [[RES:%.*]] = call nonnull ptr [[FN]]()
+; ATTRIBUTOR-NEXT: ret ptr [[RES]]
+;
+ %res = call nonnull ptr %fn()
+ ret ptr %res
+}
+
attributes #0 = { null_pointer_is_valid }
attributes #1 = { nounwind willreturn}
diff --git a/llvm/test/Transforms/FunctionAttrs/norecurse.ll b/llvm/test/Transforms/FunctionAttrs/norecurse.ll
index 7a089f6..5cb8ac0 100644
--- a/llvm/test/Transforms/FunctionAttrs/norecurse.ll
+++ b/llvm/test/Transforms/FunctionAttrs/norecurse.ll
@@ -241,6 +241,37 @@ define void @r() norecurse {
call void @q()
ret void
}
+
+define void @unknown_call(ptr %fn) {
+; FNATTRS-LABEL: define {{[^@]+}}@unknown_call
+; FNATTRS-SAME: (ptr readonly captures(none) [[FN:%.*]]) {
+; FNATTRS-NEXT: call void [[FN]]()
+; FNATTRS-NEXT: ret void
+;
+; ATTRIBUTOR-LABEL: define {{[^@]+}}@unknown_call
+; ATTRIBUTOR-SAME: (ptr nofree nonnull captures(none) [[FN:%.*]]) {
+; ATTRIBUTOR-NEXT: call void [[FN]]()
+; ATTRIBUTOR-NEXT: ret void
+;
+ call void %fn()
+ ret void
+}
+
+define void @unknown_norecurse_call(ptr %fn) {
+; FNATTRS-LABEL: define {{[^@]+}}@unknown_norecurse_call
+; FNATTRS-SAME: (ptr readonly captures(none) [[FN:%.*]]) {
+; FNATTRS-NEXT: call void [[FN]]() #[[ATTR7:[0-9]+]]
+; FNATTRS-NEXT: ret void
+;
+; ATTRIBUTOR-LABEL: define {{[^@]+}}@unknown_norecurse_call
+; ATTRIBUTOR-SAME: (ptr nofree nonnull captures(none) [[FN:%.*]]) {
+; ATTRIBUTOR-NEXT: call void [[FN]]() #[[ATTR9:[0-9]+]]
+; ATTRIBUTOR-NEXT: ret void
+;
+ call void %fn() norecurse
+ ret void
+}
+
;.
; FNATTRS: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
; FNATTRS: attributes #[[ATTR1]] = { nofree nosync nounwind memory(none) }
@@ -249,6 +280,7 @@ define void @r() norecurse {
; FNATTRS: attributes #[[ATTR4]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }
; FNATTRS: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
; FNATTRS: attributes #[[ATTR6]] = { nofree norecurse nosync memory(none) }
+; FNATTRS: attributes #[[ATTR7]] = { norecurse }
;.
; ATTRIBUTOR: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
; ATTRIBUTOR: attributes #[[ATTR1]] = { nofree nosync nounwind memory(none) }
@@ -259,6 +291,7 @@ define void @r() norecurse {
; ATTRIBUTOR: attributes #[[ATTR6]] = { norecurse nosync memory(none) }
; ATTRIBUTOR: attributes #[[ATTR7]] = { nosync }
; ATTRIBUTOR: attributes #[[ATTR8]] = { nofree willreturn }
+; ATTRIBUTOR: attributes #[[ATTR9]] = { norecurse }
;.
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; COMMON: {{.*}}
diff --git a/llvm/test/Transforms/FunctionAttrs/nounwind.ll b/llvm/test/Transforms/FunctionAttrs/nounwind.ll
index afa9ae3..076a7df 100644
--- a/llvm/test/Transforms/FunctionAttrs/nounwind.ll
+++ b/llvm/test/Transforms/FunctionAttrs/nounwind.ll
@@ -4,10 +4,15 @@
; TEST 1
define i32 @foo1() {
-; COMMON: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
-; COMMON-LABEL: define {{[^@]+}}@foo1
-; COMMON-SAME: () #[[ATTR0:[0-9]+]] {
-; COMMON-NEXT: ret i32 1
+; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+; FNATTRS-LABEL: define {{[^@]+}}@foo1
+; FNATTRS-SAME: () #[[ATTR0:[0-9]+]] {
+; FNATTRS-NEXT: ret i32 1
+;
+; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+; ATTRIBUTOR-LABEL: define {{[^@]+}}@foo1
+; ATTRIBUTOR-SAME: () #[[ATTR0:[0-9]+]] {
+; ATTRIBUTOR-NEXT: ret i32 1
;
ret i32 1
}
@@ -70,14 +75,23 @@ define void @call_non_nounwind(){
; }
define i32 @maybe_throw(i1 zeroext %0) {
-; COMMON-LABEL: define {{[^@]+}}@maybe_throw
-; COMMON-SAME: (i1 zeroext [[TMP0:%.*]]) {
-; COMMON-NEXT: br i1 [[TMP0]], label [[TMP2:%.*]], label [[TMP3:%.*]]
-; COMMON: 2:
-; COMMON-NEXT: tail call void @__cxa_rethrow()
-; COMMON-NEXT: unreachable
-; COMMON: 3:
-; COMMON-NEXT: ret i32 -1
+; FNATTRS-LABEL: define {{[^@]+}}@maybe_throw
+; FNATTRS-SAME: (i1 zeroext [[TMP0:%.*]]) {
+; FNATTRS-NEXT: br i1 [[TMP0]], label [[TMP2:%.*]], label [[TMP3:%.*]]
+; FNATTRS: 2:
+; FNATTRS-NEXT: tail call void @__cxa_rethrow()
+; FNATTRS-NEXT: unreachable
+; FNATTRS: 3:
+; FNATTRS-NEXT: ret i32 -1
+;
+; ATTRIBUTOR-LABEL: define {{[^@]+}}@maybe_throw
+; ATTRIBUTOR-SAME: (i1 zeroext [[TMP0:%.*]]) {
+; ATTRIBUTOR-NEXT: br i1 [[TMP0]], label [[TMP2:%.*]], label [[TMP3:%.*]]
+; ATTRIBUTOR: 2:
+; ATTRIBUTOR-NEXT: tail call void @__cxa_rethrow()
+; ATTRIBUTOR-NEXT: unreachable
+; ATTRIBUTOR: 3:
+; ATTRIBUTOR-NEXT: ret i32 -1
;
br i1 %0, label %2, label %3
@@ -101,18 +115,31 @@ declare void @__cxa_rethrow()
; }
define i32 @catch_thing() personality ptr @__gxx_personality_v0 {
-; COMMON-LABEL: define {{[^@]+}}@catch_thing() personality ptr @__gxx_personality_v0 {
-; COMMON-NEXT: invoke void @__cxa_rethrow()
-; COMMON-NEXT: to label [[TMP1:%.*]] unwind label [[TMP2:%.*]]
-; COMMON: 1:
-; COMMON-NEXT: unreachable
-; COMMON: 2:
-; COMMON-NEXT: [[TMP3:%.*]] = landingpad { ptr, i32 }
-; COMMON-NEXT: catch ptr null
-; COMMON-NEXT: [[TMP4:%.*]] = extractvalue { ptr, i32 } [[TMP3]], 0
-; COMMON-NEXT: [[TMP5:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP4]])
-; COMMON-NEXT: tail call void @__cxa_end_catch()
-; COMMON-NEXT: ret i32 -1
+; FNATTRS-LABEL: define {{[^@]+}}@catch_thing() personality ptr @__gxx_personality_v0 {
+; FNATTRS-NEXT: invoke void @__cxa_rethrow()
+; FNATTRS-NEXT: to label [[TMP1:%.*]] unwind label [[TMP2:%.*]]
+; FNATTRS: 1:
+; FNATTRS-NEXT: unreachable
+; FNATTRS: 2:
+; FNATTRS-NEXT: [[TMP3:%.*]] = landingpad { ptr, i32 }
+; FNATTRS-NEXT: catch ptr null
+; FNATTRS-NEXT: [[TMP4:%.*]] = extractvalue { ptr, i32 } [[TMP3]], 0
+; FNATTRS-NEXT: [[TMP5:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP4]])
+; FNATTRS-NEXT: tail call void @__cxa_end_catch()
+; FNATTRS-NEXT: ret i32 -1
+;
+; ATTRIBUTOR-LABEL: define {{[^@]+}}@catch_thing() personality ptr @__gxx_personality_v0 {
+; ATTRIBUTOR-NEXT: invoke void @__cxa_rethrow()
+; ATTRIBUTOR-NEXT: to label [[TMP1:%.*]] unwind label [[TMP2:%.*]]
+; ATTRIBUTOR: 1:
+; ATTRIBUTOR-NEXT: unreachable
+; ATTRIBUTOR: 2:
+; ATTRIBUTOR-NEXT: [[TMP3:%.*]] = landingpad { ptr, i32 }
+; ATTRIBUTOR-NEXT: catch ptr null
+; ATTRIBUTOR-NEXT: [[TMP4:%.*]] = extractvalue { ptr, i32 } [[TMP3]], 0
+; ATTRIBUTOR-NEXT: [[TMP5:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP4]])
+; ATTRIBUTOR-NEXT: tail call void @__cxa_end_catch()
+; ATTRIBUTOR-NEXT: ret i32 -1
;
invoke void @__cxa_rethrow() #1
to label %1 unwind label %2
@@ -130,9 +157,13 @@ define i32 @catch_thing() personality ptr @__gxx_personality_v0 {
}
define i32 @catch_thing_user() {
-; COMMON-LABEL: define {{[^@]+}}@catch_thing_user() {
-; COMMON-NEXT: [[CATCH_THING_CALL:%.*]] = call i32 @catch_thing()
-; COMMON-NEXT: ret i32 [[CATCH_THING_CALL]]
+; FNATTRS-LABEL: define {{[^@]+}}@catch_thing_user() {
+; FNATTRS-NEXT: [[CATCH_THING_CALL:%.*]] = call i32 @catch_thing()
+; FNATTRS-NEXT: ret i32 [[CATCH_THING_CALL]]
+;
+; ATTRIBUTOR-LABEL: define {{[^@]+}}@catch_thing_user() {
+; ATTRIBUTOR-NEXT: [[CATCH_THING_CALL:%.*]] = call i32 @catch_thing()
+; ATTRIBUTOR-NEXT: ret i32 [[CATCH_THING_CALL]]
;
%catch_thing_call = call i32 @catch_thing()
ret i32 %catch_thing_call
@@ -147,10 +178,10 @@ define void @catch_specific_landingpad() personality ptr @__gxx_personality_v0 {
; COMMON-LABEL: define {{[^@]+}}@catch_specific_landingpad
; COMMON-SAME: () #[[ATTR3:[0-9]+]] personality ptr @__gxx_personality_v0 {
; COMMON-NEXT: invoke void @do_throw()
-; COMMON-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[LPAD:%.*]]
+; COMMON-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[LPAD:%.*]]
; COMMON: lpad:
; COMMON-NEXT: [[LP:%.*]] = landingpad { ptr, i32 }
-; COMMON-NEXT: catch ptr @catch_ty
+; COMMON-NEXT: catch ptr @catch_ty
; COMMON-NEXT: call void @abort()
; COMMON-NEXT: unreachable
; COMMON: unreachable:
@@ -174,10 +205,10 @@ define void @catch_all_landingpad() personality ptr @__gxx_personality_v0 {
; COMMON-LABEL: define {{[^@]+}}@catch_all_landingpad
; COMMON-SAME: () #[[ATTR4:[0-9]+]] personality ptr @__gxx_personality_v0 {
; COMMON-NEXT: invoke void @do_throw()
-; COMMON-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[LPAD:%.*]]
+; COMMON-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[LPAD:%.*]]
; COMMON: lpad:
; COMMON-NEXT: [[LP:%.*]] = landingpad { ptr, i32 }
-; COMMON-NEXT: catch ptr null
+; COMMON-NEXT: catch ptr null
; COMMON-NEXT: call void @abort()
; COMMON-NEXT: unreachable
; COMMON: unreachable:
@@ -201,10 +232,10 @@ define void @filter_specific_landingpad() personality ptr @__gxx_personality_v0
; COMMON-LABEL: define {{[^@]+}}@filter_specific_landingpad
; COMMON-SAME: () #[[ATTR3]] personality ptr @__gxx_personality_v0 {
; COMMON-NEXT: invoke void @do_throw()
-; COMMON-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[LPAD:%.*]]
+; COMMON-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[LPAD:%.*]]
; COMMON: lpad:
; COMMON-NEXT: [[LP:%.*]] = landingpad { ptr, i32 }
-; COMMON-NEXT: filter [1 x ptr] [ptr @catch_ty]
+; COMMON-NEXT: filter [1 x ptr] [ptr @catch_ty]
; COMMON-NEXT: call void @abort()
; COMMON-NEXT: unreachable
; COMMON: unreachable:
@@ -228,10 +259,10 @@ define void @filter_none_landingpad() personality ptr @__gxx_personality_v0 {
; COMMON-LABEL: define {{[^@]+}}@filter_none_landingpad
; COMMON-SAME: () #[[ATTR4]] personality ptr @__gxx_personality_v0 {
; COMMON-NEXT: invoke void @do_throw()
-; COMMON-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[LPAD:%.*]]
+; COMMON-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[LPAD:%.*]]
; COMMON: lpad:
; COMMON-NEXT: [[LP:%.*]] = landingpad { ptr, i32 }
-; COMMON-NEXT: filter [0 x ptr] zeroinitializer
+; COMMON-NEXT: filter [0 x ptr] zeroinitializer
; COMMON-NEXT: call void @abort()
; COMMON-NEXT: unreachable
; COMMON: unreachable:
@@ -255,10 +286,10 @@ define void @cleanup_landingpad() personality ptr @__gxx_personality_v0 {
; COMMON-LABEL: define {{[^@]+}}@cleanup_landingpad
; COMMON-SAME: () #[[ATTR3]] personality ptr @__gxx_personality_v0 {
; COMMON-NEXT: invoke void @do_throw()
-; COMMON-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[LPAD:%.*]]
+; COMMON-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[LPAD:%.*]]
; COMMON: lpad:
; COMMON-NEXT: [[LP:%.*]] = landingpad { ptr, i32 }
-; COMMON-NEXT: cleanup
+; COMMON-NEXT: cleanup
; COMMON-NEXT: call void @abort()
; COMMON-NEXT: unreachable
; COMMON: unreachable:
@@ -282,7 +313,7 @@ define void @cleanuppad() personality ptr @__gxx_personality_v0 {
; FNATTRS-LABEL: define {{[^@]+}}@cleanuppad
; FNATTRS-SAME: () #[[ATTR3]] personality ptr @__gxx_personality_v0 {
; FNATTRS-NEXT: invoke void @do_throw()
-; FNATTRS-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CPAD:%.*]]
+; FNATTRS-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CPAD:%.*]]
; FNATTRS: cpad:
; FNATTRS-NEXT: [[CP:%.*]] = cleanuppad within none []
; FNATTRS-NEXT: call void @abort()
@@ -294,7 +325,7 @@ define void @cleanuppad() personality ptr @__gxx_personality_v0 {
; ATTRIBUTOR-LABEL: define {{[^@]+}}@cleanuppad
; ATTRIBUTOR-SAME: () #[[ATTR4]] personality ptr @__gxx_personality_v0 {
; ATTRIBUTOR-NEXT: invoke void @do_throw()
-; ATTRIBUTOR-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CPAD:%.*]]
+; ATTRIBUTOR-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CPAD:%.*]]
; ATTRIBUTOR: cpad:
; ATTRIBUTOR-NEXT: [[CP:%.*]] = cleanuppad within none []
; ATTRIBUTOR-NEXT: call void @abort()
@@ -319,7 +350,7 @@ define void @catchswitch_cleanuppad() personality ptr @__gxx_personality_v0 {
; FNATTRS-LABEL: define {{[^@]+}}@catchswitch_cleanuppad
; FNATTRS-SAME: () #[[ATTR3]] personality ptr @__gxx_personality_v0 {
; FNATTRS-NEXT: invoke void @do_throw()
-; FNATTRS-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CS:%.*]]
+; FNATTRS-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CS:%.*]]
; FNATTRS: cs:
; FNATTRS-NEXT: [[TOK:%.*]] = catchswitch within none [label %catch] unwind label [[CPAD:%.*]]
; FNATTRS: catch:
@@ -337,7 +368,7 @@ define void @catchswitch_cleanuppad() personality ptr @__gxx_personality_v0 {
; ATTRIBUTOR-LABEL: define {{[^@]+}}@catchswitch_cleanuppad
; ATTRIBUTOR-SAME: () #[[ATTR4]] personality ptr @__gxx_personality_v0 {
; ATTRIBUTOR-NEXT: invoke void @do_throw()
-; ATTRIBUTOR-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CS:%.*]]
+; ATTRIBUTOR-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CS:%.*]]
; ATTRIBUTOR: cs:
; ATTRIBUTOR-NEXT: [[TOK:%.*]] = catchswitch within none [label %catch] unwind label [[CPAD:%.*]]
; ATTRIBUTOR: catch:
@@ -371,6 +402,38 @@ unreachable:
unreachable
}
+define void @unknown_call(ptr %fn) {
+; FNATTRS-LABEL: define {{[^@]+}}@unknown_call
+; FNATTRS-SAME: (ptr readonly captures(none) [[FN:%.*]]) {
+; FNATTRS-NEXT: call void [[FN]]()
+; FNATTRS-NEXT: ret void
+;
+; ATTRIBUTOR-LABEL: define {{[^@]+}}@unknown_call
+; ATTRIBUTOR-SAME: (ptr nofree nonnull captures(none) [[FN:%.*]]) {
+; ATTRIBUTOR-NEXT: call void [[FN]]()
+; ATTRIBUTOR-NEXT: ret void
+;
+ call void %fn()
+ ret void
+}
+
+define void @unknown_nounwind_call(ptr %fn) {
+; FNATTRS: Function Attrs: nounwind
+; FNATTRS-LABEL: define {{[^@]+}}@unknown_nounwind_call
+; FNATTRS-SAME: (ptr readonly captures(none) [[FN:%.*]]) #[[ATTR2:[0-9]+]] {
+; FNATTRS-NEXT: call void [[FN]]() #[[ATTR2]]
+; FNATTRS-NEXT: ret void
+;
+; ATTRIBUTOR: Function Attrs: nounwind
+; ATTRIBUTOR-LABEL: define {{[^@]+}}@unknown_nounwind_call
+; ATTRIBUTOR-SAME: (ptr nofree nonnull captures(none) [[FN:%.*]]) #[[ATTR2:[0-9]+]] {
+; ATTRIBUTOR-NEXT: call void [[FN]]() #[[ATTR2]]
+; ATTRIBUTOR-NEXT: ret void
+;
+ call void %fn() nounwind
+ ret void
+}
+
declare i32 @__gxx_personality_v0(...)
declare ptr @__cxa_begin_catch(ptr)
diff --git a/llvm/test/Transforms/FunctionAttrs/sendmsg-nocallback.ll b/llvm/test/Transforms/FunctionAttrs/sendmsg-nocallback.ll
index 4d5db32..04575e4 100644
--- a/llvm/test/Transforms/FunctionAttrs/sendmsg-nocallback.ll
+++ b/llvm/test/Transforms/FunctionAttrs/sendmsg-nocallback.ll
@@ -50,10 +50,12 @@ define internal i32 @sendmsg_rtn_is_norecurse() {
}
define void @user() {
-; FNATTRS-LABEL: define void @user() {
+; FNATTRS: Function Attrs: norecurse nounwind
+; FNATTRS-LABEL: define void @user(
+; FNATTRS-SAME: ) #[[ATTR1]] {
; FNATTRS-NEXT: call void @sendmsg_is_norecurse()
; FNATTRS-NEXT: call void @sendmsghalt_is_norecurse()
-; FNATTRS-NEXT: call void @sendmsg_rtn_is_norecurse()
+; FNATTRS-NEXT: [[TMP1:%.*]] = call i32 @sendmsg_rtn_is_norecurse()
; FNATTRS-NEXT: ret void
;
; ATTRIBUTOR: Function Attrs: norecurse nounwind
@@ -61,12 +63,12 @@ define void @user() {
; ATTRIBUTOR-SAME: ) #[[ATTR1]] {
; ATTRIBUTOR-NEXT: call void @sendmsg_is_norecurse() #[[ATTR5:[0-9]+]]
; ATTRIBUTOR-NEXT: call void @sendmsghalt_is_norecurse() #[[ATTR6:[0-9]+]]
-; ATTRIBUTOR-NEXT: call void @sendmsg_rtn_is_norecurse() #[[ATTR6]]
+; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = call i32 @sendmsg_rtn_is_norecurse() #[[ATTR6]]
; ATTRIBUTOR-NEXT: ret void
;
call void @sendmsg_is_norecurse()
call void @sendmsghalt_is_norecurse()
- call void @sendmsg_rtn_is_norecurse()
+ call i32 @sendmsg_rtn_is_norecurse()
ret void
}
;.
diff --git a/llvm/test/Transforms/HipStdPar/math-fixup.ll b/llvm/test/Transforms/HipStdPar/math-fixup.ll
new file mode 100644
index 0000000..2c4622c
--- /dev/null
+++ b/llvm/test/Transforms/HipStdPar/math-fixup.ll
@@ -0,0 +1,548 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=hipstdpar-math-fixup %s | FileCheck %s
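+;
+; The fixup rewrites supported math operations, whether expressed as LLVM
+; intrinsics (e.g. @llvm.acos.f64) or as libm calls (e.g. @acosh), into calls
+; to the matching __hipstdpar_*_f64/__hipstdpar_*_f32 wrappers, as the tests
+; below check.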
+
+define void @test_acos(double %dbl, float %flt) {
+; CHECK-LABEL: define void @test_acos(
+; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_acos_f64(double [[DBL]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_acos_f32(float [[FLT]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @llvm.acos.f64(double %dbl)
+ %1 = call float @llvm.acos.f32(float %flt)
+ ret void
+}
+
+define void @test_acosh(double %dbl, float %flt) {
+; CHECK-LABEL: define void @test_acosh(
+; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_acosh_f64(double [[DBL]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_acosh_f32(float [[FLT]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @acosh(double %dbl)
+ %1 = call float @acoshf(float %flt)
+ ret void
+}
+
+define void @test_asin(double %dbl, float %flt) {
+; CHECK-LABEL: define void @test_asin(
+; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_asin_f64(double [[DBL]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_asin_f32(float [[FLT]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @llvm.asin.f64(double %dbl)
+ %1 = call float @llvm.asin.f32(float %flt)
+ ret void
+}
+
+define void @test_asinh(double %dbl, float %flt) {
+; CHECK-LABEL: define void @test_asinh(
+; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_asinh_f64(double [[DBL]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_asinh_f32(float [[FLT]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @asinh(double %dbl)
+ %1 = call float @asinhf(float %flt)
+ ret void
+}
+
+define void @test_atan(double %dbl, float %flt) {
+; CHECK-LABEL: define void @test_atan(
+; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_atan_f64(double [[DBL]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_atan_f32(float [[FLT]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @llvm.atan.f64(double %dbl)
+ %1 = call float @llvm.atan.f32(float %flt)
+ ret void
+}
+
+define void @test_atanh(double %dbl, float %flt) {
+; CHECK-LABEL: define void @test_atanh(
+; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_atanh_f64(double [[DBL]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_atanh_f32(float [[FLT]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @atanh(double %dbl)
+ %1 = call float @atanhf(float %flt)
+ ret void
+}
+
+define void @test_atan2(double %dbl, float %flt) {
+; CHECK-LABEL: define void @test_atan2(
+; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_atan2_f64(double [[DBL]], double [[DBL]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_atan2_f32(float [[FLT]], float [[FLT]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @llvm.atan2.f64(double %dbl, double %dbl)
+ %1 = call float @llvm.atan2.f32(float %flt, float %flt)
+ ret void
+}
+
+define void @test_cbrt(double %dbl, float %flt) {
+; CHECK-LABEL: define void @test_cbrt(
+; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_cbrt_f64(double [[DBL]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_cbrt_f32(float [[FLT]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @cbrt(double %dbl)
+ %1 = call float @cbrtf(float %flt)
+ ret void
+}
+
+define void @test_cos(double %dbl) {
+; CHECK-LABEL: define void @test_cos(
+; CHECK-SAME: double [[DBL:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_cos_f64(double [[DBL]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @llvm.cos.f64(double %dbl)
+ ret void
+}
+
+define void @test_cosh(double %dbl, float %flt) {
+; CHECK-LABEL: define void @test_cosh(
+; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_cosh_f64(double [[DBL]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_cosh_f32(float [[FLT]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @llvm.cosh.f64(double %dbl)
+ %1 = call float @llvm.cosh.f32(float %flt)
+ ret void
+}
+
+define void @test_erf(double %dbl, float %flt) {
+; CHECK-LABEL: define void @test_erf(
+; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_erf_f64(double [[DBL]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_erf_f32(float [[FLT]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @erf(double %dbl)
+ %1 = call float @erff(float %flt)
+ ret void
+}
+
+define void @test_erfc(double %dbl, float %flt) {
+; CHECK-LABEL: define void @test_erfc(
+; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_erfc_f64(double [[DBL]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_erfc_f32(float [[FLT]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @erfc(double %dbl)
+ %1 = call float @erfcf(float %flt)
+ ret void
+}
+
+define void @test_exp(double %dbl) {
+; CHECK-LABEL: define void @test_exp(
+; CHECK-SAME: double [[DBL:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_exp_f64(double [[DBL]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @llvm.exp.f64(double %dbl)
+ ret void
+}
+
+define void @test_exp2(double %dbl) {
+; CHECK-LABEL: define void @test_exp2(
+; CHECK-SAME: double [[DBL:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_exp2_f64(double [[DBL]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @llvm.exp2.f64(double %dbl)
+ ret void
+}
+
+define void @test_expm1(double %dbl, float %flt) {
+; CHECK-LABEL: define void @test_expm1(
+; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_expm1_f64(double [[DBL]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_expm1_f32(float [[FLT]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @expm1(double %dbl)
+ %1 = call float @expm1f(float %flt)
+ ret void
+}
+
+define void @test_fdim(double %dbl, float %flt) {
+; CHECK-LABEL: define void @test_fdim(
+; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_fdim_f64(double [[DBL]], double [[DBL]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_fdim_f32(float [[FLT]], float [[FLT]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @fdim(double %dbl, double %dbl)
+ %1 = call float @fdimf(float %flt, float %flt)
+ ret void
+}
+
+define void @test_hypot(double %dbl, float %flt) {
+; CHECK-LABEL: define void @test_hypot(
+; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_hypot_f64(double [[DBL]], double [[DBL]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_hypot_f32(float [[FLT]], float [[FLT]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @hypot(double %dbl, double %dbl)
+ %1 = call float @hypotf(float %flt, float %flt)
+ ret void
+}
+
+define void @test_lgamma(double %dbl, float %flt) {
+; CHECK-LABEL: define void @test_lgamma(
+; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_lgamma_f64(double [[DBL]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_lgamma_f32(float [[FLT]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @lgamma(double %dbl)
+ %1 = call float @lgammaf(float %flt)
+ ret void
+}
+
+define void @test_log(double %dbl) {
+; CHECK-LABEL: define void @test_log(
+; CHECK-SAME: double [[DBL:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_log_f64(double [[DBL]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @llvm.log.f64(double %dbl)
+ ret void
+}
+
+define void @test_log10(double %dbl) {
+; CHECK-LABEL: define void @test_log10(
+; CHECK-SAME: double [[DBL:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_log10_f64(double [[DBL]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @llvm.log10.f64(double %dbl)
+ ret void
+}
+
+define void @test_log2(double %dbl) {
+; CHECK-LABEL: define void @test_log2(
+; CHECK-SAME: double [[DBL:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_log2_f64(double [[DBL]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @llvm.log2.f64(double %dbl)
+ ret void
+}
+
+define void @test_log1p(double %dbl, float %flt) {
+; CHECK-LABEL: define void @test_log1p(
+; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_log1p_f64(double [[DBL]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_log1p_f32(float [[FLT]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @log1p(double %dbl)
+ %1 = call float @log1pf(float %flt)
+ ret void
+}
+
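+; @llvm.modf returns { fractional, integral }; the rewrite must preserve the
+; struct return of the __hipstdpar_modf_* wrapper and still store the integral
+; part (element 1) through the output pointer.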
+define void @test_modf(double %dbl, float %flt, ptr %pdbl, ptr %pflt) {
+; CHECK-LABEL: define void @test_modf(
+; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]], ptr [[PDBL:%.*]], ptr [[PFLT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = tail call { double, double } @__hipstdpar_modf_f64(double [[DBL]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { double, double } [[TMP0]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { double, double } [[TMP0]], 1
+; CHECK-NEXT: store double [[TMP2]], ptr [[PDBL]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = tail call { float, float } @__hipstdpar_modf_f32(float [[FLT]])
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { float, float } [[TMP3]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { float, float } [[TMP3]], 1
+; CHECK-NEXT: store float [[TMP5]], ptr [[PFLT]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = tail call { double, double } @llvm.modf.f64(double %dbl)
+ %1 = extractvalue { double, double } %0, 0
+ %2 = extractvalue { double, double } %0, 1
+ store double %2, ptr %pdbl, align 8
+ %3 = tail call { float, float } @llvm.modf.f32(float %flt)
+ %4 = extractvalue { float, float } %3, 0
+ %5 = extractvalue { float, float } %3, 1
+ store float %5, ptr %pflt, align 4
+ ret void
+}
+
+define void @test_pow(double %dbl) {
+; CHECK-LABEL: define void @test_pow(
+; CHECK-SAME: double [[DBL:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_pow_f64(double [[DBL]], double [[DBL]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @llvm.pow.f64(double %dbl, double %dbl)
+ ret void
+}
+
+define void @test_remainder(double %dbl, float %flt) {
+; CHECK-LABEL: define void @test_remainder(
+; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_remainder_f64(double [[DBL]], double [[DBL]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_remainder_f32(float [[FLT]], float [[FLT]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @remainder(double %dbl, double %dbl)
+ %1 = call float @remainderf(float %flt, float %flt)
+ ret void
+}
+
+define void @test_remquo(double %dbl, float %flt, ptr %p) {
+; CHECK-LABEL: define void @test_remquo(
+; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_remquo_f64(double [[DBL]], double [[DBL]], ptr [[P]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_remquo_f32(float [[FLT]], float [[FLT]], ptr [[P]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @remquo(double %dbl, double %dbl, ptr %p)
+ %1 = call float @remquof(float %flt, float %flt, ptr %p)
+ ret void
+}
+
+define void @test_sin(double %dbl) {
+; CHECK-LABEL: define void @test_sin(
+; CHECK-SAME: double [[DBL:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_sin_f64(double [[DBL]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @llvm.sin.f64(double %dbl)
+ ret void
+}
+
+define void @test_sinh(double %dbl, float %flt) {
+; CHECK-LABEL: define void @test_sinh(
+; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_sinh_f64(double [[DBL]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_sinh_f32(float [[FLT]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @llvm.sinh.f64(double %dbl)
+ %1 = call float @llvm.sinh.f32(float %flt)
+ ret void
+}
+
+define void @test_tan(double %dbl, float %flt) {
+; CHECK-LABEL: define void @test_tan(
+; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_tan_f64(double [[DBL]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_tan_f32(float [[FLT]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @llvm.tan.f64(double %dbl)
+ %1 = call float @llvm.tan.f32(float %flt)
+ ret void
+}
+
+define void @test_tanh(double %dbl, float %flt) {
+; CHECK-LABEL: define void @test_tanh(
+; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_tanh_f64(double [[DBL]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_tanh_f32(float [[FLT]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @llvm.tanh.f64(double %dbl)
+ %1 = call float @llvm.tanh.f32(float %flt)
+ ret void
+}
+
+define void @test_tgamma(double %dbl, float %flt) {
+; CHECK-LABEL: define void @test_tgamma(
+; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_tgamma_f64(double [[DBL]])
+; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_tgamma_f32(float [[FLT]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = call double @tgamma(double %dbl)
+ %1 = call float @tgammaf(float %flt)
+ ret void
+}
+
+@globdbl = global double 4.200000e+01
+@globflt = global float 4.200000e+01
+
+define void @global_args() {
+; CHECK-LABEL: define void @global_args() {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[DBL:%.*]] = load double, ptr @globdbl, align 8
+; CHECK-NEXT: [[FLT:%.*]] = load float, ptr @globflt, align 4
+; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_remquo_f64(double [[DBL]], double [[DBL]], ptr @globdbl)
+; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_remquo_f32(float [[FLT]], float [[FLT]], ptr @globflt)
+; CHECK-NEXT: ret void
+;
+entry:
+ %dbl = load double, ptr @globdbl
+ %flt = load float, ptr @globflt
+ %1 = call double @remquo(double %dbl, double %dbl, ptr @globdbl)
+ %2 = call float @remquof(float %flt, float %flt, ptr @globflt)
+ ret void
+}
+
+declare hidden double @remainder(double, double)
+
+declare hidden float @remainderf(float, float)
+
+declare hidden double @remquo(double, double, ptr)
+
+declare hidden float @remquof(float, float, ptr)
+
+declare hidden double @fdim(double, double)
+
+declare hidden float @fdimf(float, float)
+
+declare double @llvm.exp.f64(double)
+
+declare float @llvm.exp.f32(float)
+
+declare double @llvm.exp2.f64(double)
+
+declare float @llvm.exp2.f32(float)
+
+declare hidden double @expm1(double)
+
+declare hidden float @expm1f(float)
+
+declare double @llvm.log.f64(double)
+
+declare double @llvm.log10.f64(double)
+
+declare double @llvm.log2.f64(double)
+
+declare hidden double @log1p(double)
+
+declare hidden float @log1pf(float)
+
+declare { float, float } @llvm.modf.f32(float)
+
+declare { double, double } @llvm.modf.f64(double)
+
+declare double @llvm.pow.f64(double, double)
+
+declare hidden double @cbrt(double)
+
+declare hidden float @cbrtf(float)
+
+declare hidden double @hypot(double, double)
+
+declare hidden float @hypotf(float, float)
+
+declare double @llvm.sin.f64(double)
+
+declare double @llvm.cos.f64(double)
+
+declare double @llvm.tan.f64(double)
+
+declare double @llvm.asin.f64(double)
+
+declare double @llvm.acos.f64(double)
+
+declare double @llvm.atan.f64(double)
+
+declare double @llvm.atan2.f64(double, double)
+
+declare double @llvm.sinh.f64(double)
+
+declare double @llvm.cosh.f64(double)
+
+declare double @llvm.tanh.f64(double)
+
+declare hidden double @asinh(double)
+
+declare hidden float @asinhf(float)
+
+declare hidden double @acosh(double)
+
+declare hidden float @acoshf(float)
+
+declare hidden double @atanh(double)
+
+declare hidden float @atanhf(float)
+
+declare hidden double @erf(double)
+
+declare hidden float @erff(float)
+
+declare hidden double @erfc(double)
+
+declare hidden float @erfcf(float)
+
+declare hidden double @tgamma(double)
+
+declare hidden float @tgammaf(float)
+
+declare hidden double @lgamma(double)
+
+declare hidden float @lgammaf(float)
diff --git a/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs-low-threshold.ll b/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs-low-threshold.ll
index 72003d2..b57a45f 100644
--- a/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs-low-threshold.ll
+++ b/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs-low-threshold.ll
@@ -5,11 +5,11 @@
; However, if the call to @streaming_callee requires a streaming-mode change, the inliner should always inline the call, because the streaming-mode change is more expensive.
target triple = "aarch64"
-declare void @streaming_compatible_f() "aarch64_pstate_sm_compatible"
+declare void @streaming_compatible_f() #0 "aarch64_pstate_sm_compatible"
; Function @streaming_callee doesn't contain any operations that may use ZA
; state and therefore can be legally inlined into a normal function.
-define void @streaming_callee() "aarch64_pstate_sm_enabled" {
+define void @streaming_callee() #0 "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: define void @streaming_callee
; CHECK-SAME: () #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: call void @streaming_compatible_f()
@@ -22,7 +22,7 @@ define void @streaming_callee() "aarch64_pstate_sm_enabled" {
}
; Inline call to @streaming_callee to remove a streaming mode change.
-define void @non_streaming_caller_inline() {
+define void @non_streaming_caller_inline() #0 {
; CHECK-LABEL: define void @non_streaming_caller_inline
; CHECK-SAME: () #[[ATTR2:[0-9]+]] {
; CHECK-NEXT: call void @streaming_compatible_f()
@@ -34,7 +34,7 @@ define void @non_streaming_caller_inline() {
}
; Don't inline the call to @streaming_callee when the inline-threshold is set to 1, because inlining it does not eliminate a streaming-mode change.
-define void @streaming_caller_dont_inline() "aarch64_pstate_sm_enabled" {
+define void @streaming_caller_dont_inline() #0 "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: define void @streaming_caller_dont_inline
; CHECK-SAME: () #[[ATTR1]] {
; CHECK-NEXT: call void @streaming_callee()
@@ -43,3 +43,5 @@ define void @streaming_caller_dont_inline() "aarch64_pstate_sm_enabled" {
call void @streaming_callee()
ret void
}
+
+attributes #0 = { "target-features"="+sve,+sme" }
diff --git a/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll b/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll
index 7723e6c..6cb1692 100644
--- a/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll
+++ b/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll
@@ -7,7 +7,7 @@ declare i32 @llvm.vscale.i32()
; by the other functions below. If we see the call to one of these functions
; being replaced by 'llvm.vscale()', then we know it has been inlined.
-define i32 @normal_callee() {
+define i32 @normal_callee() #0 {
; CHECK-LABEL: define i32 @normal_callee
; CHECK-SAME: () #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: entry:
@@ -19,7 +19,7 @@ entry:
ret i32 %res
}
-define i32 @streaming_callee() "aarch64_pstate_sm_enabled" {
+define i32 @streaming_callee() #0 "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: define i32 @streaming_callee
; CHECK-SAME: () #[[ATTR2:[0-9]+]] {
; CHECK-NEXT: entry:
@@ -31,7 +31,7 @@ entry:
ret i32 %res
}
-define i32 @locally_streaming_callee() "aarch64_pstate_sm_body" {
+define i32 @locally_streaming_callee() #0 "aarch64_pstate_sm_body" {
; CHECK-LABEL: define i32 @locally_streaming_callee
; CHECK-SAME: () #[[ATTR3:[0-9]+]] {
; CHECK-NEXT: entry:
@@ -43,9 +43,9 @@ entry:
ret i32 %res
}
-define i32 @streaming_compatible_callee() "aarch64_pstate_sm_compatible" {
+define i32 @streaming_compatible_callee() #0 "aarch64_pstate_sm_compatible" {
; CHECK-LABEL: define i32 @streaming_compatible_callee
-; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR4:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: ret i32 [[RES]]
@@ -55,9 +55,9 @@ entry:
ret i32 %res
}
-define i32 @streaming_compatible_locally_streaming_callee() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
+define i32 @streaming_compatible_locally_streaming_callee() #0 "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_callee
-; CHECK-SAME: () #[[ATTR4:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR5:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: ret i32 [[RES]]
@@ -84,9 +84,9 @@ entry:
; [ ] N -> SC
; [ ] N -> N + B
; [ ] N -> SC + B
-define i32 @normal_caller_normal_callee_inline() {
+define i32 @normal_caller_normal_callee_inline() #0 {
; CHECK-LABEL: define i32 @normal_caller_normal_callee_inline
-; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-SAME: () #[[ATTR6:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: ret i32 [[RES_I]]
@@ -101,9 +101,9 @@ entry:
; [ ] N -> SC
; [ ] N -> N + B
; [ ] N -> SC + B
-define i32 @normal_caller_streaming_callee_dont_inline() {
+define i32 @normal_caller_streaming_callee_dont_inline() #0 {
; CHECK-LABEL: define i32 @normal_caller_streaming_callee_dont_inline
-; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-SAME: () #[[ATTR6]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES:%.*]] = call i32 @streaming_callee()
; CHECK-NEXT: ret i32 [[RES]]
@@ -118,9 +118,9 @@ entry:
; [x] N -> SC
; [ ] N -> N + B
; [ ] N -> SC + B
-define i32 @normal_caller_streaming_compatible_callee_inline() {
+define i32 @normal_caller_streaming_compatible_callee_inline() #0 {
; CHECK-LABEL: define i32 @normal_caller_streaming_compatible_callee_inline
-; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-SAME: () #[[ATTR6]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: ret i32 [[RES_I]]
@@ -135,9 +135,9 @@ entry:
; [ ] N -> SC
; [x] N -> N + B
; [ ] N -> SC + B
-define i32 @normal_caller_locally_streaming_callee_dont_inline() {
+define i32 @normal_caller_locally_streaming_callee_dont_inline() #0 {
; CHECK-LABEL: define i32 @normal_caller_locally_streaming_callee_dont_inline
-; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-SAME: () #[[ATTR6]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES:%.*]] = call i32 @locally_streaming_callee()
; CHECK-NEXT: ret i32 [[RES]]
@@ -152,9 +152,9 @@ entry:
; [ ] N -> SC
; [ ] N -> N + B
; [x] N -> SC + B
-define i32 @normal_caller_streaming_compatible_locally_streaming_callee_dont_inline() {
+define i32 @normal_caller_streaming_compatible_locally_streaming_callee_dont_inline() #0 {
; CHECK-LABEL: define i32 @normal_caller_streaming_compatible_locally_streaming_callee_dont_inline
-; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-SAME: () #[[ATTR6]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES:%.*]] = call i32 @streaming_compatible_locally_streaming_callee()
; CHECK-NEXT: ret i32 [[RES]]
@@ -169,9 +169,9 @@ entry:
; [ ] S -> SC
; [ ] S -> N + B
; [ ] S -> SC + B
-define i32 @streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_enabled" {
+define i32 @streaming_caller_normal_callee_dont_inline() #0 "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: define i32 @streaming_caller_normal_callee_dont_inline
-; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-SAME: () #[[ATTR7:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES:%.*]] = call i32 @normal_callee()
; CHECK-NEXT: ret i32 [[RES]]
@@ -186,9 +186,9 @@ entry:
; [ ] S -> SC
; [ ] S -> N + B
; [ ] S -> SC + B
-define i32 @streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_enabled" {
+define i32 @streaming_caller_streaming_callee_inline() #0 "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: define i32 @streaming_caller_streaming_callee_inline
-; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-SAME: () #[[ATTR7]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: ret i32 [[RES_I]]
@@ -203,9 +203,9 @@ entry:
; [x] S -> SC
; [ ] S -> N + B
; [ ] S -> SC + B
-define i32 @streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_enabled" {
+define i32 @streaming_caller_streaming_compatible_callee_inline() #0 "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: define i32 @streaming_caller_streaming_compatible_callee_inline
-; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-SAME: () #[[ATTR7]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: ret i32 [[RES_I]]
@@ -220,9 +220,9 @@ entry:
; [ ] S -> SC
; [x] S -> N + B
; [ ] S -> SC + B
-define i32 @streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_enabled" {
+define i32 @streaming_caller_locally_streaming_callee_inline() #0 "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: define i32 @streaming_caller_locally_streaming_callee_inline
-; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-SAME: () #[[ATTR7]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: ret i32 [[RES_I]]
@@ -237,9 +237,9 @@ entry:
; [ ] S -> SC
; [ ] S -> N + B
; [x] S -> SC + B
-define i32 @streaming_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_enabled" {
+define i32 @streaming_caller_streaming_compatible_locally_streaming_callee_inline() #0 "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: define i32 @streaming_caller_streaming_compatible_locally_streaming_callee_inline
-; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-SAME: () #[[ATTR7]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: ret i32 [[RES_I]]
@@ -254,9 +254,9 @@ entry:
; [ ] N + B -> SC
; [ ] N + B -> N + B
; [ ] N + B -> SC + B
-define i32 @locally_streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_body" {
+define i32 @locally_streaming_caller_normal_callee_dont_inline() #0 "aarch64_pstate_sm_body" {
; CHECK-LABEL: define i32 @locally_streaming_caller_normal_callee_dont_inline
-; CHECK-SAME: () #[[ATTR3]] {
+; CHECK-SAME: () #[[ATTR8:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES:%.*]] = call i32 @normal_callee()
; CHECK-NEXT: ret i32 [[RES]]
@@ -271,9 +271,9 @@ entry:
; [ ] N + B -> SC
; [ ] N + B -> N + B
; [ ] N + B -> SC + B
-define i32 @locally_streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_body" {
+define i32 @locally_streaming_caller_streaming_callee_inline() #0 "aarch64_pstate_sm_body" {
; CHECK-LABEL: define i32 @locally_streaming_caller_streaming_callee_inline
-; CHECK-SAME: () #[[ATTR3]] {
+; CHECK-SAME: () #[[ATTR8]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: ret i32 [[RES_I]]
@@ -288,9 +288,9 @@ entry:
; [x] N + B -> SC
; [ ] N + B -> N + B
; [ ] N + B -> SC + B
-define i32 @locally_streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_body" {
+define i32 @locally_streaming_caller_streaming_compatible_callee_inline() #0 "aarch64_pstate_sm_body" {
; CHECK-LABEL: define i32 @locally_streaming_caller_streaming_compatible_callee_inline
-; CHECK-SAME: () #[[ATTR3]] {
+; CHECK-SAME: () #[[ATTR8]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: ret i32 [[RES_I]]
@@ -305,9 +305,9 @@ entry:
; [ ] N + B -> SC
; [x] N + B -> N + B
; [ ] N + B -> SC + B
-define i32 @locally_streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_body" {
+define i32 @locally_streaming_caller_locally_streaming_callee_inline() #0 "aarch64_pstate_sm_body" {
; CHECK-LABEL: define i32 @locally_streaming_caller_locally_streaming_callee_inline
-; CHECK-SAME: () #[[ATTR3]] {
+; CHECK-SAME: () #[[ATTR8]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: ret i32 [[RES_I]]
@@ -322,9 +322,9 @@ entry:
; [ ] N + B -> SC
; [ ] N + B -> N + B
; [x] N + B -> SC + B
-define i32 @locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_body" {
+define i32 @locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline() #0 "aarch64_pstate_sm_body" {
; CHECK-LABEL: define i32 @locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline
-; CHECK-SAME: () #[[ATTR3]] {
+; CHECK-SAME: () #[[ATTR8]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: ret i32 [[RES_I]]
@@ -339,9 +339,9 @@ entry:
; [ ] SC -> SC
; [ ] SC -> N + B
; [ ] SC -> SC + B
-define i32 @streaming_compatible_caller_normal_callee_dont_inline() "aarch64_pstate_sm_compatible" {
+define i32 @streaming_compatible_caller_normal_callee_dont_inline() #0 "aarch64_pstate_sm_compatible" {
; CHECK-LABEL: define i32 @streaming_compatible_caller_normal_callee_dont_inline
-; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-SAME: () #[[ATTR9:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES:%.*]] = call i32 @normal_callee()
; CHECK-NEXT: ret i32 [[RES]]
@@ -356,9 +356,9 @@ entry:
; [ ] SC -> SC
; [ ] SC -> N + B
; [ ] SC -> SC + B
-define i32 @streaming_compatible_caller_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" {
+define i32 @streaming_compatible_caller_streaming_callee_dont_inline() #0 "aarch64_pstate_sm_compatible" {
; CHECK-LABEL: define i32 @streaming_compatible_caller_streaming_callee_dont_inline
-; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-SAME: () #[[ATTR9]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES:%.*]] = call i32 @streaming_callee()
; CHECK-NEXT: ret i32 [[RES]]
@@ -373,9 +373,9 @@ entry:
; [x] SC -> SC
; [ ] SC -> N + B
; [ ] SC -> SC + B
-define i32 @streaming_compatible_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_compatible" {
+define i32 @streaming_compatible_caller_streaming_compatible_callee_inline() #0 "aarch64_pstate_sm_compatible" {
; CHECK-LABEL: define i32 @streaming_compatible_caller_streaming_compatible_callee_inline
-; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-SAME: () #[[ATTR9]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: ret i32 [[RES_I]]
@@ -390,9 +390,9 @@ entry:
; [ ] SC -> SC
; [x] SC -> N + B
; [ ] SC -> SC + B
-define i32 @streaming_compatible_caller_locally_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" {
+define i32 @streaming_compatible_caller_locally_streaming_callee_dont_inline() #0 "aarch64_pstate_sm_compatible" {
; CHECK-LABEL: define i32 @streaming_compatible_caller_locally_streaming_callee_dont_inline
-; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-SAME: () #[[ATTR9]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES:%.*]] = call i32 @locally_streaming_callee()
; CHECK-NEXT: ret i32 [[RES]]
@@ -407,9 +407,9 @@ entry:
; [ ] SC -> SC
; [ ] SC -> N + B
; [x] SC -> SC + B
-define i32 @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" {
+define i32 @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_dont_inline() #0 "aarch64_pstate_sm_compatible" {
; CHECK-LABEL: define i32 @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_dont_inline
-; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-SAME: () #[[ATTR9]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES:%.*]] = call i32 @streaming_compatible_locally_streaming_callee()
; CHECK-NEXT: ret i32 [[RES]]
@@ -423,9 +423,9 @@ entry:
; [ ] SC + B -> SC
; [ ] SC + B -> N + B
; [ ] SC + B -> SC + B
-define i32 @streaming_compatible_locally_streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
+define i32 @streaming_compatible_locally_streaming_caller_normal_callee_dont_inline() #0 "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_caller_normal_callee_dont_inline
-; CHECK-SAME: () #[[ATTR4]] {
+; CHECK-SAME: () #[[ATTR10:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES:%.*]] = call i32 @normal_callee()
; CHECK-NEXT: ret i32 [[RES]]
@@ -440,9 +440,9 @@ entry:
; [ ] SC + B -> SC
; [ ] SC + B -> N + B
; [ ] SC + B -> SC + B
-define i32 @streaming_compatible_locally_streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
+define i32 @streaming_compatible_locally_streaming_caller_streaming_callee_inline() #0 "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_caller_streaming_callee_inline
-; CHECK-SAME: () #[[ATTR4]] {
+; CHECK-SAME: () #[[ATTR10]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: ret i32 [[RES_I]]
@@ -457,9 +457,9 @@ entry:
; [x] SC + B -> SC
; [ ] SC + B -> N + B
; [ ] SC + B -> SC + B
-define i32 @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
+define i32 @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline() #0 "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline
-; CHECK-SAME: () #[[ATTR4]] {
+; CHECK-SAME: () #[[ATTR10]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: ret i32 [[RES_I]]
@@ -474,9 +474,9 @@ entry:
; [ ] SC + B -> SC
; [x] SC + B -> N + B
; [ ] SC + B -> SC + B
-define i32 @streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
+define i32 @streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline() #0 "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline
-; CHECK-SAME: () #[[ATTR4]] {
+; CHECK-SAME: () #[[ATTR10]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: ret i32 [[RES_I]]
@@ -491,9 +491,9 @@ entry:
; [ ] SC + B -> SC
; [ ] SC + B -> N + B
; [x] SC + B -> SC + B
-define i32 @streaming_compatible_locally_streaming_caller_and_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
+define i32 @streaming_compatible_locally_streaming_caller_and_callee_inline() #0 "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_caller_and_callee_inline
-; CHECK-SAME: () #[[ATTR4]] {
+; CHECK-SAME: () #[[ATTR10]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: ret i32 [[RES_I]]
@@ -503,9 +503,9 @@ entry:
ret i32 %res
}
-define void @normal_callee_with_inlineasm() {
+define void @normal_callee_with_inlineasm() #0 {
; CHECK-LABEL: define void @normal_callee_with_inlineasm
-; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-SAME: () #[[ATTR6]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: ret void
@@ -515,9 +515,9 @@ entry:
ret void
}
-define void @streaming_caller_normal_callee_with_inlineasm_dont_inline() "aarch64_pstate_sm_enabled" {
+define void @streaming_caller_normal_callee_with_inlineasm_dont_inline() #0 "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: define void @streaming_caller_normal_callee_with_inlineasm_dont_inline
-; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-SAME: () #[[ATTR7]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: call void @normal_callee_with_inlineasm()
; CHECK-NEXT: ret void
@@ -527,9 +527,9 @@ entry:
ret void
}
-define i64 @normal_callee_with_intrinsic_call() {
+define i64 @normal_callee_with_intrinsic_call() #0 {
; CHECK-LABEL: define i64 @normal_callee_with_intrinsic_call
-; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-SAME: () #[[ATTR6]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.aarch64.sve.cntb(i32 4)
; CHECK-NEXT: ret i64 [[RES]]
@@ -539,9 +539,9 @@ entry:
ret i64 %res
}
-define i64 @streaming_caller_normal_callee_with_intrinsic_call_dont_inline() "aarch64_pstate_sm_enabled" {
+define i64 @streaming_caller_normal_callee_with_intrinsic_call_dont_inline() #0 "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: define i64 @streaming_caller_normal_callee_with_intrinsic_call_dont_inline
-; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-SAME: () #[[ATTR7]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES:%.*]] = call i64 @normal_callee_with_intrinsic_call()
; CHECK-NEXT: ret i64 [[RES]]
@@ -553,9 +553,9 @@ entry:
declare i64 @llvm.aarch64.sve.cntb(i32)
-define i64 @normal_callee_call_sme_state() {
+define i64 @normal_callee_call_sme_state() #0 {
; CHECK-LABEL: define i64 @normal_callee_call_sme_state
-; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-SAME: () #[[ATTR6]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES:%.*]] = call { i64, i64 } @__arm_sme_state()
; CHECK-NEXT: [[RES_0:%.*]] = extractvalue { i64, i64 } [[RES]], 0
@@ -569,9 +569,9 @@ entry:
declare {i64, i64} @__arm_sme_state()
-define i64 @streaming_caller_normal_callee_call_sme_state_dont_inline() "aarch64_pstate_sm_enabled" {
+define i64 @streaming_caller_normal_callee_call_sme_state_dont_inline() #0 "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: define i64 @streaming_caller_normal_callee_call_sme_state_dont_inline
-; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-SAME: () #[[ATTR7]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[RES:%.*]] = call i64 @normal_callee_call_sme_state()
; CHECK-NEXT: ret i64 [[RES]]
@@ -585,9 +585,9 @@ entry:
declare void @streaming_body() "aarch64_pstate_sm_enabled"
-define void @streaming_caller_single_streaming_callee() "aarch64_pstate_sm_enabled" {
+define void @streaming_caller_single_streaming_callee() #0 "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: define void @streaming_caller_single_streaming_callee
-; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-SAME: () #[[ATTR7]] {
; CHECK-NEXT: call void @streaming_body()
; CHECK-NEXT: ret void
;
@@ -595,9 +595,9 @@ define void @streaming_caller_single_streaming_callee() "aarch64_pstate_sm_enabl
ret void
}
-define void @streaming_caller_multiple_streaming_callees() "aarch64_pstate_sm_enabled" {
+define void @streaming_caller_multiple_streaming_callees() #0 "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: define void @streaming_caller_multiple_streaming_callees
-; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-SAME: () #[[ATTR7]] {
; CHECK-NEXT: call void @streaming_body()
; CHECK-NEXT: call void @streaming_body()
; CHECK-NEXT: ret void
@@ -608,9 +608,9 @@ define void @streaming_caller_multiple_streaming_callees() "aarch64_pstate_sm_en
}
; Allow inlining, as it would not increase the number of streaming-mode changes.
-define void @streaming_caller_single_streaming_callee_inline() {
+define void @streaming_caller_single_streaming_callee_inline() #0 {
; CHECK-LABEL: define void @streaming_caller_single_streaming_callee_inline
-; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-SAME: () #[[ATTR6]] {
; CHECK-NEXT: call void @streaming_body()
; CHECK-NEXT: ret void
;
@@ -619,9 +619,9 @@ define void @streaming_caller_single_streaming_callee_inline() {
}
; Prevent inlining, as it would lead to multiple streaming-mode changes.
-define void @streaming_caller_multiple_streaming_callees_dont_inline() {
+define void @streaming_caller_multiple_streaming_callees_dont_inline() #0 {
; CHECK-LABEL: define void @streaming_caller_multiple_streaming_callees_dont_inline
-; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-SAME: () #[[ATTR6]] {
; CHECK-NEXT: call void @streaming_caller_multiple_streaming_callees()
; CHECK-NEXT: ret void
;
@@ -631,9 +631,9 @@ define void @streaming_caller_multiple_streaming_callees_dont_inline() {
declare void @streaming_compatible_body() "aarch64_pstate_sm_compatible"
-define void @streaming_caller_single_streaming_compatible_callee() "aarch64_pstate_sm_enabled" {
+define void @streaming_caller_single_streaming_compatible_callee() #0 "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: define void @streaming_caller_single_streaming_compatible_callee
-; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-SAME: () #[[ATTR7]] {
; CHECK-NEXT: call void @streaming_compatible_body()
; CHECK-NEXT: ret void
;
@@ -641,9 +641,9 @@ define void @streaming_caller_single_streaming_compatible_callee() "aarch64_psta
ret void
}
-define void @streaming_caller_multiple_streaming_compatible_callees() "aarch64_pstate_sm_enabled" {
+define void @streaming_caller_multiple_streaming_compatible_callees() #0 "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: define void @streaming_caller_multiple_streaming_compatible_callees
-; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-SAME: () #[[ATTR7]] {
; CHECK-NEXT: call void @streaming_compatible_body()
; CHECK-NEXT: call void @streaming_compatible_body()
; CHECK-NEXT: ret void
@@ -654,9 +654,9 @@ define void @streaming_caller_multiple_streaming_compatible_callees() "aarch64_p
}
; Allow inlining, as inlining would remove a streaming-mode change.
-define void @streaming_caller_single_streaming_compatible_callee_inline() {
+define void @streaming_caller_single_streaming_compatible_callee_inline() #0 {
; CHECK-LABEL: define void @streaming_caller_single_streaming_compatible_callee_inline
-; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-SAME: () #[[ATTR6]] {
; CHECK-NEXT: call void @streaming_compatible_body()
; CHECK-NEXT: ret void
;
@@ -665,9 +665,9 @@ define void @streaming_caller_single_streaming_compatible_callee_inline() {
}
; Allow inlining, as inlining would remove several streaming-mode changes.
-define void @streaming_caller_multiple_streaming_compatible_callees_inline() {
+define void @streaming_caller_multiple_streaming_compatible_callees_inline() #0 {
; CHECK-LABEL: define void @streaming_caller_multiple_streaming_compatible_callees_inline
-; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-SAME: () #[[ATTR6]] {
; CHECK-NEXT: call void @streaming_compatible_body()
; CHECK-NEXT: call void @streaming_compatible_body()
; CHECK-NEXT: ret void
@@ -675,3 +675,5 @@ define void @streaming_caller_multiple_streaming_compatible_callees_inline() {
call void @streaming_caller_multiple_streaming_compatible_callees()
ret void
}
+
+attributes #0 = { "target-features"="+sve,+sme" }
diff --git a/llvm/test/Transforms/InstCombine/recurrence-binary-intrinsic.ll b/llvm/test/Transforms/InstCombine/recurrence-binary-intrinsic.ll
new file mode 100644
index 0000000..c637481
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/recurrence-binary-intrinsic.ll
@@ -0,0 +1,394 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
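+; The loops below implement a simple recurrence acc = intrinsic(acc, %b) with a
+; loop-invariant second operand; the checks pin down that instcombine leaves
+; these min/max recurrences intact.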
+define i8 @simple_recurrence_intrinsic_smax(i8 %n, i8 %a, i8 %b) {
+; CHECK-LABEL: define i8 @simple_recurrence_intrinsic_smax(
+; CHECK-SAME: i8 [[N:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SMAX_ACC:%.*]] = phi i8 [ [[SMAX:%.*]], %[[LOOP]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-NEXT: [[SMAX]] = call i8 @llvm.smax.i8(i8 [[SMAX_ACC]], i8 [[B]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i8 [[IV]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i8 [[SMAX]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i8 [ %iv.next, %loop ], [ 0, %entry ]
+ %smax.acc = phi i8 [ %smax, %loop ], [ %a, %entry ]
+ %smax = call i8 @llvm.smax.i8(i8 %smax.acc, i8 %b)
+ %iv.next = add nuw i8 %iv, 1
+ %cmp = icmp ult i8 %iv.next, %n
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret i8 %smax
+}
+
+define i8 @simple_recurrence_intrinsic_smin(i8 %n, i8 %a, i8 %b) {
+; CHECK-LABEL: define i8 @simple_recurrence_intrinsic_smin(
+; CHECK-SAME: i8 [[N:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SMIN_ACC:%.*]] = phi i8 [ [[SMIN:%.*]], %[[LOOP]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-NEXT: [[SMIN]] = call i8 @llvm.smin.i8(i8 [[SMIN_ACC]], i8 [[B]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i8 [[IV]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i8 [[SMIN]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i8 [ %iv.next, %loop ], [ 0, %entry ]
+ %smin.acc = phi i8 [ %smin, %loop ], [ %a, %entry ]
+ %smin = call i8 @llvm.smin.i8(i8 %smin.acc, i8 %b)
+ %iv.next = add nuw i8 %iv, 1
+ %cmp = icmp ult i8 %iv.next, %n
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret i8 %smin
+}
+
+define i8 @simple_recurrence_intrinsic_umax(i8 %n, i8 %a, i8 %b) {
+; CHECK-LABEL: define i8 @simple_recurrence_intrinsic_umax(
+; CHECK-SAME: i8 [[N:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[UMAX_ACC:%.*]] = phi i8 [ [[UMAX:%.*]], %[[LOOP]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-NEXT: [[UMAX]] = call i8 @llvm.umax.i8(i8 [[UMAX_ACC]], i8 [[B]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i8 [[IV]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i8 [[UMAX]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i8 [ %iv.next, %loop ], [ 0, %entry ]
+ %umax.acc = phi i8 [ %umax, %loop ], [ %a, %entry ]
+ %umax = call i8 @llvm.umax.i8(i8 %umax.acc, i8 %b)
+ %iv.next = add nuw i8 %iv, 1
+ %cmp = icmp ult i8 %iv.next, %n
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret i8 %umax
+}
+
+define i8 @simple_recurrence_intrinsic_umin(i8 %n, i8 %a, i8 %b) {
+; CHECK-LABEL: define i8 @simple_recurrence_intrinsic_umin(
+; CHECK-SAME: i8 [[N:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[UMIN_ACC:%.*]] = phi i8 [ [[UMIN:%.*]], %[[LOOP]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-NEXT: [[UMIN]] = call i8 @llvm.umin.i8(i8 [[UMIN_ACC]], i8 [[B]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i8 [[IV]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i8 [[UMIN]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i8 [ %iv.next, %loop ], [ 0, %entry ]
+ %umin.acc = phi i8 [ %umin, %loop ], [ %a, %entry ]
+ %umin = call i8 @llvm.umin.i8(i8 %umin.acc, i8 %b)
+ %iv.next = add nuw i8 %iv, 1
+ %cmp = icmp ult i8 %iv.next, %n
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret i8 %umin
+}
+
+define float @simple_recurrence_intrinsic_maxnum(i32 %n, float %a, float %b) {
+; CHECK-LABEL: define float @simple_recurrence_intrinsic_maxnum(
+; CHECK-SAME: i32 [[N:%.*]], float [[A:%.*]], float [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[FMAX_ACC:%.*]] = phi float [ [[FMAX:%.*]], %[[LOOP]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-NEXT: [[FMAX]] = call float @llvm.maxnum.f32(float [[FMAX_ACC]], float [[B]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i32 [[IV]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret float [[FMAX]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+ %fmax.acc = phi float [ %fmax, %loop ], [ %a, %entry ]
+ %fmax = call float @llvm.maxnum.f32(float %fmax.acc, float %b)
+ %iv.next = add nuw i32 %iv, 1
+ %cmp = icmp ult i32 %iv.next, %n
+ br i1 %cmp, label %loop, label %exit
+exit:
+ ret float %fmax
+}
+
+define float @simple_recurrence_intrinsic_minnum(i32 %n, float %a, float %b) {
+; CHECK-LABEL: define float @simple_recurrence_intrinsic_minnum(
+; CHECK-SAME: i32 [[N:%.*]], float [[A:%.*]], float [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[FMIN_ACC:%.*]] = phi float [ [[FMIN:%.*]], %[[LOOP]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-NEXT: [[FMIN]] = call float @llvm.minnum.f32(float [[FMIN_ACC]], float [[B]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i32 [[IV]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret float [[FMIN]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+ %fmin.acc = phi float [ %fmin, %loop ], [ %a, %entry ]
+ %fmin = call float @llvm.minnum.f32(float %fmin.acc, float %b)
+ %iv.next = add nuw i32 %iv, 1
+ %cmp = icmp ult i32 %iv.next, %n
+ br i1 %cmp, label %loop, label %exit
+exit:
+ ret float %fmin
+}
+
+define float @simple_recurrence_intrinsic_maximum(i32 %n, float %a, float %b) {
+; CHECK-LABEL: define float @simple_recurrence_intrinsic_maximum(
+; CHECK-SAME: i32 [[N:%.*]], float [[A:%.*]], float [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[FMAX_ACC:%.*]] = phi float [ [[FMAX:%.*]], %[[LOOP]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-NEXT: [[FMAX]] = call nnan float @llvm.maximum.f32(float [[FMAX_ACC]], float [[B]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i32 [[IV]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret float [[FMAX]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+ %fmax.acc = phi float [ %fmax, %loop ], [ %a, %entry ]
+ %fmax = call nnan float @llvm.maximum.f32(float %fmax.acc, float %b)
+ %iv.next = add nuw i32 %iv, 1
+ %cmp = icmp ult i32 %iv.next, %n
+ br i1 %cmp, label %loop, label %exit
+exit:
+ ret float %fmax
+}
+
+define float @simple_recurrence_intrinsic_minimum(i32 %n, float %a, float %b) {
+; CHECK-LABEL: define float @simple_recurrence_intrinsic_minimum(
+; CHECK-SAME: i32 [[N:%.*]], float [[A:%.*]], float [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[FMIN_ACC:%.*]] = phi float [ [[FMIN:%.*]], %[[LOOP]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-NEXT: [[FMIN]] = call nnan float @llvm.minimum.f32(float [[FMIN_ACC]], float [[B]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i32 [[IV]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret float [[FMIN]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+ %fmin.acc = phi float [ %fmin, %loop ], [ %a, %entry ]
+ %fmin = call nnan float @llvm.minimum.f32(float %fmin.acc, float %b)
+ %iv.next = add nuw i32 %iv, 1
+ %cmp = icmp ult i32 %iv.next, %n
+ br i1 %cmp, label %loop, label %exit
+exit:
+ ret float %fmin
+}
+
+define float @simple_recurrence_intrinsic_maximumnum(i32 %n, float %a, float %b) {
+; CHECK-LABEL: define float @simple_recurrence_intrinsic_maximumnum(
+; CHECK-SAME: i32 [[N:%.*]], float [[A:%.*]], float [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[FMAX_ACC:%.*]] = phi float [ [[FMAX:%.*]], %[[LOOP]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-NEXT: [[FMAX]] = call nnan float @llvm.maximumnum.f32(float [[FMAX_ACC]], float [[B]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i32 [[IV]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret float [[FMAX]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+ %fmax.acc = phi float [ %fmax, %loop ], [ %a, %entry ]
+ %fmax = call nnan float @llvm.maximumnum.f32(float %fmax.acc, float %b)
+ %iv.next = add nuw i32 %iv, 1
+ %cmp = icmp ult i32 %iv.next, %n
+ br i1 %cmp, label %loop, label %exit
+exit:
+ ret float %fmax
+}
+
+define float @simple_recurrence_intrinsic_minimumnum(i32 %n, float %a, float %b) {
+; CHECK-LABEL: define float @simple_recurrence_intrinsic_minimumnum(
+; CHECK-SAME: i32 [[N:%.*]], float [[A:%.*]], float [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[FMIN_ACC:%.*]] = phi float [ [[FMIN:%.*]], %[[LOOP]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-NEXT: [[FMIN]] = call nnan float @llvm.minimumnum.f32(float [[FMIN_ACC]], float [[B]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i32 [[IV]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret float [[FMIN]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+ %fmin.acc = phi float [ %fmin, %loop ], [ %a, %entry ]
+ %fmin = call nnan float @llvm.minimumnum.f32(float %fmin.acc, float %b)
+ %iv.next = add nuw i32 %iv, 1
+ %cmp = icmp ult i32 %iv.next, %n
+ br i1 %cmp, label %loop, label %exit
+exit:
+ ret float %fmin
+}
+
+define i8 @simple_recurrence_intrinsic_multiuse_phi(i8 %n, i8 %a, i8 %b) {
+; CHECK-LABEL: define i8 @simple_recurrence_intrinsic_multiuse_phi(
+; CHECK-SAME: i8 [[N:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[UMAX_ACC:%.*]] = phi i8 [ [[UMAX:%.*]], %[[LOOP]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-NEXT: call void @use(i8 [[UMAX_ACC]])
+; CHECK-NEXT: [[UMAX]] = call i8 @llvm.umax.i8(i8 [[UMAX_ACC]], i8 [[B]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i8 [[IV]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i8 [[UMAX]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i8 [ %iv.next, %loop ], [ 0, %entry ]
+ %umax.acc = phi i8 [ %umax, %loop ], [ %a, %entry ]
+ call void @use(i8 %umax.acc)
+ %umax = call i8 @llvm.umax.i8(i8 %umax.acc, i8 %b)
+ %iv.next = add nuw i8 %iv, 1
+ %cmp = icmp ult i8 %iv.next, %n
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret i8 %umax
+}
+
+; Negative tests.
+
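+; uadd.sat is not a min/max intrinsic, and the final test has a loop-variant
+; second operand, so neither should be recognized as a simple recurrence; the
+; IR must remain unchanged.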
+define i8 @simple_recurrence_intrinsic_uadd_sat(i8 %n, i8 %a, i8 %b) {
+; CHECK-LABEL: define i8 @simple_recurrence_intrinsic_uadd_sat(
+; CHECK-SAME: i8 [[N:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[UADD_SAT_ACC:%.*]] = phi i8 [ [[UADD_SAT:%.*]], %[[LOOP]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-NEXT: [[UADD_SAT]] = call i8 @llvm.uadd.sat.i8(i8 [[UADD_SAT_ACC]], i8 [[B]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i8 [[IV]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i8 [[UADD_SAT]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i8 [ %iv.next, %loop ], [ 0, %entry ]
+ %uadd.sat.acc = phi i8 [ %uadd.sat, %loop ], [ %a, %entry ]
+ %uadd.sat = call i8 @llvm.uadd.sat.i8(i8 %uadd.sat.acc, i8 %b)
+ %iv.next = add nuw i8 %iv, 1
+ %cmp = icmp ult i8 %iv.next, %n
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret i8 %uadd.sat
+}
+
+define i8 @simple_recurrence_intrinsic_arg_loop_variant(i8 %n, i8 %a) {
+; CHECK-LABEL: define i8 @simple_recurrence_intrinsic_arg_loop_variant(
+; CHECK-SAME: i8 [[N:%.*]], i8 [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[UMAX_ACC:%.*]] = phi i8 [ [[UMAX:%.*]], %[[LOOP]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-NEXT: [[B:%.*]] = xor i8 [[IV]], 42
+; CHECK-NEXT: [[UMAX]] = call i8 @llvm.umax.i8(i8 [[UMAX_ACC]], i8 [[B]])
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i8 [[IV]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i8 [[UMAX]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i8 [ %iv.next, %loop ], [ 0, %entry ]
+ %umax.acc = phi i8 [ %umax, %loop ], [ %a, %entry ]
+ %b = xor i8 %iv, 42
+ %umax = call i8 @llvm.umax.i8(i8 %umax.acc, i8 %b)
+ %iv.next = add nuw i8 %iv, 1
+ %cmp = icmp ult i8 %iv.next, %n
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret i8 %umax
+}
+
+declare void @use(i8)
diff --git a/llvm/test/Transforms/InstCombine/trunc-inseltpoison.ll b/llvm/test/Transforms/InstCombine/trunc-inseltpoison.ll
index 33fa2c3..f83352c 100644
--- a/llvm/test/Transforms/InstCombine/trunc-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/trunc-inseltpoison.ll
@@ -959,8 +959,8 @@ define <3 x i31> @wide_splat3(<3 x i33> %x) {
define <8 x i8> @wide_lengthening_splat(<4 x i16> %v) {
; CHECK-LABEL: @wide_lengthening_splat(
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TR:%.*]] = trunc <8 x i16> [[SHUF]] to <8 x i8>
+; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i16> [[V:%.*]] to <4 x i8>
+; CHECK-NEXT: [[TR:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: ret <8 x i8> [[TR]]
;
%shuf = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/InstCombine/trunc.ll b/llvm/test/Transforms/InstCombine/trunc.ll
index a85ce71..dfe9d94 100644
--- a/llvm/test/Transforms/InstCombine/trunc.ll
+++ b/llvm/test/Transforms/InstCombine/trunc.ll
@@ -960,8 +960,8 @@ define <3 x i31> @wide_splat3(<3 x i33> %x) {
define <8 x i8> @wide_lengthening_splat(<4 x i16> %v) {
; CHECK-LABEL: @wide_lengthening_splat(
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TR:%.*]] = trunc <8 x i16> [[SHUF]] to <8 x i8>
+; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i16> [[V:%.*]] to <4 x i8>
+; CHECK-NEXT: [[TR:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: ret <8 x i8> [[TR]]
;
%shuf = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
@@ -969,6 +969,19 @@ define <8 x i8> @wide_lengthening_splat(<4 x i16> %v) {
ret <8 x i8> %tr
}
+; This is a negative test: we expect the trunc to remain after the shuffle, as
+; it might not be beneficial to perform the trunc on a wider vector type.
+define <4 x i8> @wide_shortening_splat(<8 x i16> %v) {
+; CHECK-LABEL: @wide_shortening_splat(
+; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TR:%.*]] = trunc <4 x i16> [[SHUF]] to <4 x i8>
+; CHECK-NEXT: ret <4 x i8> [[TR]]
+;
+ %shuf = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+ %tr = trunc <4 x i16> %shuf to <4 x i8>
+ ret <4 x i8> %tr
+}
+
define <2 x i8> @narrow_add_vec_constant(<2 x i32> %x) {
; CHECK-LABEL: @narrow_add_vec_constant(
; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[X:%.*]] to <2 x i8>
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/gep.ll b/llvm/test/Transforms/InstSimplify/ConstProp/gep.ll
index 7632579..5b24906 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/gep.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/gep.ll
@@ -29,3 +29,31 @@ define ptr @f2() {
;
ret ptr getelementptr (ptr, ptr getelementptr inbounds inrange(0, 8) ([3 x ptr], ptr @vt, i64 0, i64 1), i64 3)
}
+
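+; The tests below check that constant folding of nested geps combines the
+; inrange attributes of the inner and outer gep into a single adjusted range.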
+define ptr @f3() {
+; CHECK-LABEL: @f3(
+; CHECK-NEXT: ret ptr getelementptr inbounds nuw inrange(-8, 0) (i8, ptr @vt, i64 16)
+;
+ ret ptr getelementptr inrange(-16, 8) (ptr, ptr getelementptr inbounds inrange(0, 8) ([3 x ptr], ptr @vt, i64 0, i64 1), i64 1)
+}
+
+define ptr @f4() {
+; CHECK-LABEL: @f4(
+; CHECK-NEXT: ret ptr getelementptr inbounds nuw inrange(-8, 8) (i8, ptr @vt, i64 16)
+;
+ ret ptr getelementptr inrange(-16, 8) (ptr, ptr getelementptr inbounds inrange(0, 24) ([3 x ptr], ptr @vt, i64 0, i64 1), i64 1)
+}
+
+define ptr @f5() {
+; CHECK-LABEL: @f5(
+; CHECK-NEXT: ret ptr getelementptr inbounds nuw inrange(0, 0) (i8, ptr @vt, i64 16)
+;
+ ret ptr getelementptr inrange(0, 8) (ptr, ptr getelementptr inbounds inrange(0, 8) ([3 x ptr], ptr @vt, i64 0, i64 0), i64 2)
+}
+
+define ptr @f6() {
+; CHECK-LABEL: @f6(
+; CHECK-NEXT: ret ptr getelementptr inbounds nuw inrange(-8, 8) (i8, ptr @vt, i64 16)
+;
+ ret ptr getelementptr inrange(-8, 8) (ptr, ptr getelementptr inbounds inrange(-8, 16) ([3 x ptr], ptr @vt, i64 0, i64 1), i64 1)
+}
diff --git a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll
index 672e949..b505917 100644
--- a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll
@@ -874,3 +874,79 @@ define void @load_factor2_fp128(ptr %ptr) {
%v1 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> poison, <2 x i32> <i32 1, i32 3>
ret void
}
+
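+; Factor-2 deinterleaving float loads are lowered to riscv.seg2 segment loads
+; for f32/f64. The bf16/f16 cases below are left as plain shuffles, presumably
+; because the required half/bfloat vector features are not enabled here.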
+define void @load_factor2_f32(ptr %ptr) {
+; RV32-LABEL: @load_factor2_f32(
+; RV32-NEXT: [[TMP1:%.*]] = call { <8 x float>, <8 x float> } @llvm.riscv.seg2.load.mask.v8f32.p0.i32(ptr [[PTR:%.*]], <8 x i1> splat (i1 true), i32 8)
+; RV32-NEXT: [[TMP2:%.*]] = extractvalue { <8 x float>, <8 x float> } [[TMP1]], 1
+; RV32-NEXT: [[TMP3:%.*]] = extractvalue { <8 x float>, <8 x float> } [[TMP1]], 0
+; RV32-NEXT: ret void
+;
+; RV64-LABEL: @load_factor2_f32(
+; RV64-NEXT: [[TMP1:%.*]] = call { <8 x float>, <8 x float> } @llvm.riscv.seg2.load.mask.v8f32.p0.i64(ptr [[PTR:%.*]], <8 x i1> splat (i1 true), i64 8)
+; RV64-NEXT: [[TMP2:%.*]] = extractvalue { <8 x float>, <8 x float> } [[TMP1]], 1
+; RV64-NEXT: [[TMP3:%.*]] = extractvalue { <8 x float>, <8 x float> } [[TMP1]], 0
+; RV64-NEXT: ret void
+;
+ %interleaved.vec = load <16 x float>, ptr %ptr
+ %v0 = shufflevector <16 x float> %interleaved.vec, <16 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %v1 = shufflevector <16 x float> %interleaved.vec, <16 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ ret void
+}
+
+define void @load_factor2_f64(ptr %ptr) {
+; RV32-LABEL: @load_factor2_f64(
+; RV32-NEXT: [[TMP1:%.*]] = call { <8 x double>, <8 x double> } @llvm.riscv.seg2.load.mask.v8f64.p0.i32(ptr [[PTR:%.*]], <8 x i1> splat (i1 true), i32 8)
+; RV32-NEXT: [[TMP2:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 1
+; RV32-NEXT: [[TMP3:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 0
+; RV32-NEXT: ret void
+;
+; RV64-LABEL: @load_factor2_f64(
+; RV64-NEXT: [[TMP1:%.*]] = call { <8 x double>, <8 x double> } @llvm.riscv.seg2.load.mask.v8f64.p0.i64(ptr [[PTR:%.*]], <8 x i1> splat (i1 true), i64 8)
+; RV64-NEXT: [[TMP2:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 1
+; RV64-NEXT: [[TMP3:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 0
+; RV64-NEXT: ret void
+;
+ %interleaved.vec = load <16 x double>, ptr %ptr
+ %v0 = shufflevector <16 x double> %interleaved.vec, <16 x double> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %v1 = shufflevector <16 x double> %interleaved.vec, <16 x double> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ ret void
+}
+
+define void @load_factor2_bf16(ptr %ptr) {
+; RV32-LABEL: @load_factor2_bf16(
+; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x bfloat>, ptr [[PTR:%.*]], align 32
+; RV32-NEXT: [[V0:%.*]] = shufflevector <16 x bfloat> [[INTERLEAVED_VEC]], <16 x bfloat> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; RV32-NEXT: [[V1:%.*]] = shufflevector <16 x bfloat> [[INTERLEAVED_VEC]], <16 x bfloat> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; RV32-NEXT: ret void
+;
+; RV64-LABEL: @load_factor2_bf16(
+; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x bfloat>, ptr [[PTR:%.*]], align 32
+; RV64-NEXT: [[V0:%.*]] = shufflevector <16 x bfloat> [[INTERLEAVED_VEC]], <16 x bfloat> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; RV64-NEXT: [[V1:%.*]] = shufflevector <16 x bfloat> [[INTERLEAVED_VEC]], <16 x bfloat> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; RV64-NEXT: ret void
+;
+ %interleaved.vec = load <16 x bfloat>, ptr %ptr
+ %v0 = shufflevector <16 x bfloat> %interleaved.vec, <16 x bfloat> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %v1 = shufflevector <16 x bfloat> %interleaved.vec, <16 x bfloat> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ ret void
+}
+
+define void @load_factor2_f16(ptr %ptr) {
+; RV32-LABEL: @load_factor2_f16(
+; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x half>, ptr [[PTR:%.*]], align 32
+; RV32-NEXT: [[V0:%.*]] = shufflevector <16 x half> [[INTERLEAVED_VEC]], <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; RV32-NEXT: [[V1:%.*]] = shufflevector <16 x half> [[INTERLEAVED_VEC]], <16 x half> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; RV32-NEXT: ret void
+;
+; RV64-LABEL: @load_factor2_f16(
+; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x half>, ptr [[PTR:%.*]], align 32
+; RV64-NEXT: [[V0:%.*]] = shufflevector <16 x half> [[INTERLEAVED_VEC]], <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; RV64-NEXT: [[V1:%.*]] = shufflevector <16 x half> [[INTERLEAVED_VEC]], <16 x half> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; RV64-NEXT: ret void
+;
+ %interleaved.vec = load <16 x half>, ptr %ptr
+ %v0 = shufflevector <16 x half> %interleaved.vec, <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %v1 = shufflevector <16 x half> %interleaved.vec, <16 x half> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopFusion/sunk-phi-nodes.ll b/llvm/test/Transforms/LoopFusion/sunk-phi-nodes.ll
new file mode 100644
index 0000000..36c6bdd
--- /dev/null
+++ b/llvm/test/Transforms/LoopFusion/sunk-phi-nodes.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-fusion -S < %s 2>&1 | FileCheck %s
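+; Check that the two sum loops are fused and that the phi nodes belonging to
+; the second loop (%i1.04, %sum2.03) are sunk into the fused header, with the
+; LCSSA phis ending up in %for.end8.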
+define dso_local i32 @check_sunk_phi_nodes() {
+; CHECK-LABEL: define dso_local i32 @check_sunk_phi_nodes() {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[SUM1_02:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ADD:%.*]], %[[FOR_INC6:.*]] ]
+; CHECK-NEXT: [[I_01:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_INC6]] ]
+; CHECK-NEXT: [[I1_04:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC7:%.*]], %[[FOR_INC6]] ]
+; CHECK-NEXT: [[SUM2_03:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ADD5:%.*]], %[[FOR_INC6]] ]
+; CHECK-NEXT: [[ADD]] = add nsw i32 [[SUM1_02]], [[I_01]]
+; CHECK-NEXT: br label %[[FOR_INC:.*]]
+; CHECK: [[FOR_INC]]:
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I1_04]], [[I1_04]]
+; CHECK-NEXT: [[ADD5]] = add nsw i32 [[SUM2_03]], [[MUL]]
+; CHECK-NEXT: br label %[[FOR_INC6]]
+; CHECK: [[FOR_INC6]]:
+; CHECK-NEXT: [[INC]] = add nsw i32 [[I_01]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], 10
+; CHECK-NEXT: [[INC7]] = add nsw i32 [[I1_04]], 1
+; CHECK-NEXT: [[CMP3:%.*]] = icmp slt i32 [[INC7]], 10
+; CHECK-NEXT: br i1 [[CMP3]], label %[[FOR_BODY]], label %[[FOR_END8:.*]]
+; CHECK: [[FOR_END8]]:
+; CHECK-NEXT: [[SUM2_0_LCSSA:%.*]] = phi i32 [ [[ADD5]], %[[FOR_INC6]] ]
+; CHECK-NEXT: [[SUM1_0_LCSSA:%.*]] = phi i32 [ [[ADD]], %[[FOR_INC6]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SUM1_0_LCSSA]], [[SUM2_0_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP0]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.inc
+ %sum1.02 = phi i32 [ 0, %entry ], [ %add, %for.inc ]
+ %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %add = add nsw i32 %sum1.02, %i.01
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %inc = add nsw i32 %i.01, 1
+ %cmp = icmp slt i32 %inc, 10
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.inc
+ %sum1.0.lcssa = phi i32 [ %add, %for.inc ]
+ br label %for.body4
+
+for.body4: ; preds = %for.end, %for.inc6
+ %i1.04 = phi i32 [ 0, %for.end ], [ %inc7, %for.inc6 ]
+ %sum2.03 = phi i32 [ 0, %for.end ], [ %add5, %for.inc6 ]
+ %mul = mul nsw i32 %i1.04, %i1.04
+ %add5 = add nsw i32 %sum2.03, %mul
+ br label %for.inc6
+
+for.inc6: ; preds = %for.body4
+ %inc7 = add nsw i32 %i1.04, 1
+ %cmp3 = icmp slt i32 %inc7, 10
+ br i1 %cmp3, label %for.body4, label %for.end8
+
+for.end8: ; preds = %for.inc6
+ %sum2.0.lcssa = phi i32 [ %add5, %for.inc6 ]
+ %0 = add i32 %sum1.0.lcssa, %sum2.0.lcssa
+ ret i32 %0
+}
+
diff --git a/llvm/test/Transforms/LoopIdiom/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopIdiom/reuse-lcssa-phi-scev-expansion.ll
new file mode 100644
index 0000000..65aaf72
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/reuse-lcssa-phi-scev-expansion.ll
@@ -0,0 +1,163 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-idiom -S %s | FileCheck %s
+
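+; Both tests check that, when expanding SCEVs for the strlen idiom, the
+; expander reuses existing LCSSA phi nodes and the resulting IR stays in
+; LCSSA form.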
+declare void @foo()
+declare void @bar()
+
+define void @scev_expand_ptrtoint(i8 %x, ptr %start) {
+; CHECK-LABEL: define void @scev_expand_ptrtoint(
+; CHECK-SAME: i8 [[X:%.*]], ptr [[START:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[START1:%.*]] = ptrtoint ptr [[START]] to i64
+; CHECK-NEXT: br label %[[LOOP_1_HEADER:.*]]
+; CHECK: [[LOOP_1_HEADER]]:
+; CHECK-NEXT: [[PTR_IV_1:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_1_NEXT:%.*]], %[[LOOP_1_LATCH:.*]] ]
+; CHECK-NEXT: [[C:%.*]] = icmp ule i8 [[X]], 1
+; CHECK-NEXT: br i1 [[C]], label %[[LOOP_1_LATCH]], label %[[MIDDLE:.*]]
+; CHECK: [[LOOP_1_LATCH]]:
+; CHECK-NEXT: [[PTR_IV_1_NEXT]] = getelementptr i8, ptr [[PTR_IV_1]], i64 1
+; CHECK-NEXT: br label %[[LOOP_1_HEADER]]
+; CHECK: [[MIDDLE]]:
+; CHECK-NEXT: [[PTR_IV_1_LCSSA:%.*]] = phi ptr [ [[PTR_IV_1]], %[[LOOP_1_HEADER]] ]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[X]], 0
+; CHECK-NEXT: [[CMP_EXT:%.*]] = zext i1 [[CMP]] to i64
+; CHECK-NEXT: [[GEP_START:%.*]] = getelementptr i8, ptr [[PTR_IV_1_LCSSA]], i64 [[CMP_EXT]]
+; CHECK-NEXT: br label %[[LOOP_2_HEADER:.*]]
+; CHECK: [[LOOP_2_HEADER]]:
+; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], %[[LOOP_2_LATCH:.*]] ], [ 0, %[[MIDDLE]] ]
+; CHECK-NEXT: [[PTR_IV_2:%.*]] = phi ptr [ [[GEP_START]], %[[MIDDLE]] ], [ [[PTR_IV_2_NEXT:%.*]], %[[LOOP_2_LATCH]] ]
+; CHECK-NEXT: switch i8 [[X]], label %[[LOOP_2_LATCH]] [
+; CHECK-NEXT: i8 1, label %[[LOOP_3_PREHEADER:.*]]
+; CHECK-NEXT: i8 4, label %[[LOOP_3_PREHEADER]]
+; CHECK-NEXT: ]
+; CHECK: [[LOOP_3_PREHEADER]]:
+; CHECK-NEXT: [[INDVAR_LCSSA:%.*]] = phi i64 [ [[INDVAR]], %[[LOOP_2_HEADER]] ], [ [[INDVAR]], %[[LOOP_2_HEADER]] ]
+; CHECK-NEXT: [[PTR_IV_2_LCSSA:%.*]] = phi ptr [ [[PTR_IV_2]], %[[LOOP_2_HEADER]] ], [ [[PTR_IV_2]], %[[LOOP_2_HEADER]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 0, [[START1]]
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR_IV_1_LCSSA]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[CMP_EXT]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDVAR_LCSSA]], [[TMP4]]
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]]
+; CHECK-NEXT: [[STRLEN:%.*]] = call i64 @strlen(ptr [[SCEVGEP]])
+; CHECK-NEXT: br label %[[LOOP_3:.*]]
+; CHECK: [[LOOP_2_LATCH]]:
+; CHECK-NEXT: [[PTR_IV_2_NEXT]] = getelementptr i8, ptr [[PTR_IV_2]], i64 1
+; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
+; CHECK-NEXT: br label %[[LOOP_2_HEADER]]
+; CHECK: [[LOOP_3]]:
+; CHECK-NEXT: [[IV_3:%.*]] = phi i64 [ [[IV_3_NEXT:%.*]], %[[LOOP_3]] ], [ 1, %[[LOOP_3_PREHEADER]] ]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[PTR_IV_2_LCSSA]], i64 [[IV_3]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[TMP6]], 0
+; CHECK-NEXT: [[IV_3_NEXT]] = add i64 [[IV_3]], 1
+; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[LOOP_3]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop.1.header
+
+loop.1.header:
+ %ptr.iv.1 = phi ptr [ %start, %entry ], [ %ptr.iv.1.next, %loop.1.latch ]
+ %c = icmp ule i8 %x, 1
+ br i1 %c, label %loop.1.latch, label %middle
+
+loop.1.latch:
+ %ptr.iv.1.next = getelementptr i8, ptr %ptr.iv.1, i64 1
+ br label %loop.1.header
+
+middle:
+ %cmp = icmp eq i8 %x, 0
+ %cmp.ext = zext i1 %cmp to i64
+ %gep.start = getelementptr i8, ptr %ptr.iv.1, i64 %cmp.ext
+ br label %loop.2.header
+
+loop.2.header:
+ %ptr.iv.2 = phi ptr [ %gep.start, %middle ], [ %ptr.iv.2.next, %loop.2.latch ]
+ switch i8 %x, label %loop.2.latch [
+ i8 1, label %loop.3
+ i8 4, label %loop.3
+ ]
+
+loop.2.latch:
+ %ptr.iv.2.next = getelementptr i8, ptr %ptr.iv.2, i64 1
+ br label %loop.2.header
+
+loop.3:
+ %iv.3 = phi i64 [ 1, %loop.2.header ], [ 1, %loop.2.header ], [ %iv.3.next, %loop.3 ]
+ %gep = getelementptr i8, ptr %ptr.iv.2, i64 %iv.3
+ %1 = load i8, ptr %gep, align 1
+ %ec = icmp eq i8 %1, 0
+ %iv.3.next = add i64 %iv.3, 1
+ br i1 %ec, label %exit, label %loop.3
+
+exit:
+ ret void
+}
+
+declare i1 @cond()
+
+define ptr @test_lcssa_reuse_preserve_lcssa() {
+; CHECK-LABEL: define ptr @test_lcssa_reuse_preserve_lcssa() {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[LOOP_0_HEADER:.*]]
+; CHECK: [[LOOP_0_HEADER]]:
+; CHECK-NEXT: br label %[[LOOP_1:.*]]
+; CHECK: [[LOOP_1]]:
+; CHECK-NEXT: [[IV_1:%.*]] = phi ptr [ null, %[[LOOP_0_HEADER]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP_1]] ]
+; CHECK-NEXT: [[IV_1_NEXT]] = getelementptr i8, ptr [[IV_1]], i64 1
+; CHECK-NEXT: [[EC_1:%.*]] = call i1 @cond()
+; CHECK-NEXT: br i1 [[EC_1]], label %[[THEN:.*]], label %[[LOOP_1]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: [[IV_1_LCSSA1:%.*]] = phi ptr [ [[IV_1]], %[[LOOP_1]] ]
+; CHECK-NEXT: [[C_2:%.*]] = call i1 @cond()
+; CHECK-NEXT: br i1 [[C_2]], label %[[LOOP_2_PREHEADER:.*]], label %[[LOOP_0_LATCH:.*]]
+; CHECK: [[LOOP_2_PREHEADER]]:
+; CHECK-NEXT: [[IV_1_LCSSA:%.*]] = phi ptr [ [[IV_1_LCSSA1]], %[[THEN]] ]
+; CHECK-NEXT: [[IV_1_LCSSA_LCSSA:%.*]] = phi ptr [ [[IV_1_LCSSA1]], %[[THEN]] ]
+; CHECK-NEXT: [[STRLEN:%.*]] = call i64 @strlen(ptr null)
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[IV_1_LCSSA]], i64 1
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[TMP0]], i64 [[STRLEN]]
+; CHECK-NEXT: br label %[[LOOP_2:.*]]
+; CHECK: [[LOOP_2]]:
+; CHECK-NEXT: [[IV_2:%.*]] = phi ptr [ [[RES:%.*]], %[[LOOP_2]] ], [ [[IV_1_LCSSA_LCSSA]], %[[LOOP_2_PREHEADER]] ]
+; CHECK-NEXT: [[RES]] = getelementptr i8, ptr [[IV_2]], i64 1
+; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[IV_1_LCSSA_LCSSA]], align 1
+; CHECK-NEXT: [[EC_2:%.*]] = icmp eq i8 [[L]], 0
+; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[LOOP_2]]
+; CHECK: [[LOOP_0_LATCH]]:
+; CHECK-NEXT: br label %[[LOOP_0_HEADER]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret ptr [[SCEVGEP]]
+;
+entry:
+ br label %loop.0.header
+
+loop.0.header:
+ br label %loop.1
+
+loop.1:
+ %iv.1 = phi ptr [ null, %loop.0.header ], [ %iv.1.next, %loop.1 ]
+ %iv.1.next = getelementptr i8, ptr %iv.1, i64 1
+ %ec.1 = call i1 @cond()
+ br i1 %ec.1, label %then, label %loop.1
+
+then:
+ %c.2 = call i1 @cond()
+ br i1 %c.2, label %loop.2, label %loop.0.latch
+
+loop.2:
+ %iv.2 = phi ptr [ %res, %loop.2 ], [ %iv.1, %then ]
+ %res = getelementptr i8, ptr %iv.2, i64 1
+ %l = load i8, ptr %iv.1, align 1
+ %ec.2 = icmp eq i8 %l, 0
+ br i1 %ec.2, label %exit, label %loop.2
+
+loop.0.latch:
+ br label %loop.0.header
+
+exit:
+ ret ptr %res
+}
diff --git a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
index 9c113d4..4194849 100644
--- a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
+++ b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
@@ -5,6 +5,8 @@
@A = dso_local global [256 x [256 x float]] zeroinitializer
@B = dso_local global [256 x [256 x float]] zeroinitializer
@C = dso_local global [256 x [256 x float]] zeroinitializer
+@D = global [256 x [256 x [256 x float]]] zeroinitializer
+@E = global [256 x [256 x [256 x float]]] zeroinitializer
; Check that the below loops are exchanged for vectorization.
;
@@ -64,15 +66,13 @@ exit:
; for (int j = 1; j < 256; j++)
; A[i][j-1] = A[i][j] + B[i][j];
;
-; FIXME: These loops are exchanged at this time due to the problem in
-; profitability heuristic calculation for vectorization.
-; CHECK: --- !Passed
+; CHECK: --- !Missed
; CHECK-NEXT: Pass: loop-interchange
-; CHECK-NEXT: Name: Interchanged
+; CHECK-NEXT: Name: InterchangeNotProfitable
; CHECK-NEXT: Function: interchange_unnecesasry_for_vectorization
; CHECK-NEXT: Args:
-; CHECK-NEXT: - String: Loop interchanged with enclosing loop.
+; CHECK-NEXT: - String: Insufficient information to calculate the cost of loop for interchange.
define void @interchange_unnecesasry_for_vectorization() {
entry:
br label %for.i.header
@@ -103,3 +103,134 @@ for.i.inc:
exit:
ret void
}
+
+; Check that the below loops are exchanged to allow innermost loop
+; vectorization. We cannot vectorize the j-loop because it has a lexically
+; backward dependency, but the i-loop can be vectorized because all the
+; loop-carried dependencies are lexically forward. LoopVectorize currently only
+; vectorizes the innermost loop, hence we move the i-loop to that position.
+;
+; for (int i = 0; i < 255; i++) {
+; for (int j = 1; j < 256; j++) {
+; A[i][j] = A[i][j-1] + B[i][j];
+; C[i][j] += C[i+1][j];
+; }
+; }
+;
+
+; CHECK: --- !Passed
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: Interchanged
+; CHECK-NEXT: Function: interchange_necessary_for_vectorization2
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - String: Loop interchanged with enclosing loop.
+define void @interchange_necessary_for_vectorization2() {
+entry:
+ br label %for.i.header
+
+for.i.header:
+ %i = phi i64 [ 1, %entry ], [ %i.next, %for.i.inc ]
+ %i.inc = add i64 %i, 1
+ br label %for.j.body
+
+for.j.body:
+ %j = phi i64 [ 1, %for.i.header ], [ %j.next, %for.j.body ]
+ %j.dec = add i64 %j, -1
+ %a.load.index = getelementptr [256 x [256 x float]], ptr @A, i64 0, i64 %i, i64 %j.dec
+ %b.index = getelementptr [256 x [256 x float]], ptr @B, i64 0, i64 %i, i64 %j
+ %c.load.index = getelementptr [256 x [256 x float]], ptr @C, i64 0, i64 %i.inc, i64 %j
+ %c.store.index = getelementptr [256 x [256 x float]], ptr @C, i64 0, i64 %i, i64 %j
+ %a = load float, ptr %a.load.index
+ %b = load float, ptr %b.index
+ %c0 = load float, ptr %c.load.index
+ %c1 = load float, ptr %c.store.index
+ %add.0 = fadd float %a, %b
+ %a.store.index = getelementptr [256 x [256 x float]], ptr @A, i64 0, i64 %i, i64 %j
+ store float %add.0, ptr %a.store.index
+ %add.1 = fadd float %c0, %c1
+ store float %add.1, ptr %c.store.index
+ %j.next = add i64 %j, 1
+ %cmp.j = icmp eq i64 %j.next, 256
+ br i1 %cmp.j, label %for.i.inc, label %for.j.body
+
+for.i.inc:
+ %i.next = add i64 %i, 1
+ %cmp.i = icmp eq i64 %i.next, 255
+ br i1 %cmp.i, label %exit, label %for.i.header
+
+exit:
+ ret void
+}
+
+; Check that no interchange is performed for the following loop. Interchanging
+; the j-loop and k-loop makes the innermost loop vectorizable, since the j-loop
+; has only forward dependencies. However, at the moment, a loop body consisting
+; of multiple BBs is handled pessimistically. Hence the j-loop isn't moved to
+; the innermost place.
+;
+; for (int i = 0; i < 255; i++) {
+; for (int j = 0; j < 255; j++) {
+; for (int k = 0; k < 128; k++) {
+; E[i][j][k] = D[i+1][j+1][2*k];
+; if (cond)
+; D[i][j][k+1] = 1.0;
+; }
+;   }
+; }
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: InterchangeNotProfitable
+; CHECK-NEXT: Function: multiple_BBs_in_loop
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization.
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass: loop-interchange
+; CHECK-NEXT: Name: InterchangeNotProfitable
+; CHECK-NEXT: Function: multiple_BBs_in_loop
+; CHECK-NEXT: Args:
+; CHECK-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization.
+define void @multiple_BBs_in_loop() {
+entry:
+ br label %for.i.header
+
+for.i.header:
+ %i = phi i64 [ 0, %entry ], [ %i.inc, %for.i.inc ]
+ %i.inc = add i64 %i, 1
+ br label %for.j.header
+
+for.j.header:
+ %j = phi i64 [ 0, %for.i.header ], [ %j.inc, %for.j.inc ]
+ %j.inc = add i64 %j, 1
+ br label %for.k.body
+
+for.k.body:
+ %k = phi i64 [ 0, %for.j.header ], [ %k.inc, %for.k.inc ]
+ %k.inc = add i64 %k, 1
+ %k.2 = mul i64 %k, 2
+ %d.index = getelementptr [256 x [256 x [256 x float]]], ptr @D, i64 0, i64 %i.inc, i64 %j.inc, i64 %k.2
+ %e.index = getelementptr [256 x [256 x [256 x float]]], ptr @E, i64 0, i64 %i, i64 %j, i64 %k
+ %d.load = load float, ptr %d.index
+ store float %d.load, ptr %e.index
+ %cond = freeze i1 undef
+ br i1 %cond, label %if.then, label %for.k.inc
+
+if.then:
+ %d.index2 = getelementptr [256 x [256 x [256 x float]]], ptr @D, i64 0, i64 %i, i64 %j, i64 %k.inc
+ store float 1.0, ptr %d.index2
+ br label %for.k.inc
+
+for.k.inc:
+ %cmp.k = icmp eq i64 %k.inc, 128
+ br i1 %cmp.k, label %for.j.inc, label %for.k.body
+
+for.j.inc:
+ %cmp.j = icmp eq i64 %j.inc, 255
+ br i1 %cmp.j, label %for.i.inc, label %for.j.header
+
+for.i.inc:
+ %cmp.i = icmp eq i64 %i.inc, 255
+ br i1 %cmp.i, label %exit, label %for.i.header
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopLoadElim/invalidate-laa-after-versioning.ll b/llvm/test/Transforms/LoopLoadElim/invalidate-laa-after-versioning.ll
index 10e1065..037851f 100644
--- a/llvm/test/Transforms/LoopLoadElim/invalidate-laa-after-versioning.ll
+++ b/llvm/test/Transforms/LoopLoadElim/invalidate-laa-after-versioning.ll
@@ -59,19 +59,14 @@ define void @test(ptr %arg, i64 %arg1) {
; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds double, ptr [[LCSSA_PTR_IV_1]], i64 1
; CHECK-NEXT: br label [[INNER_2:%.*]]
; CHECK: inner.2:
-; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[INNER_2]] ], [ 0, [[INNER_1_EXIT]] ]
; CHECK-NEXT: [[PTR_IV_2:%.*]] = phi ptr [ [[GEP_5]], [[INNER_1_EXIT]] ], [ [[PTR_IV_2_NEXT:%.*]], [[INNER_2]] ]
; CHECK-NEXT: [[PTR_IV_2_NEXT]] = getelementptr inbounds double, ptr [[PTR_IV_2]], i64 1
-; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
; CHECK-NEXT: br i1 false, label [[INNER_3_LVER_CHECK:%.*]], label [[INNER_2]]
; CHECK: inner.3.lver.check:
-; CHECK-NEXT: [[INDVAR_LCSSA:%.*]] = phi i64 [ [[INDVAR]], [[INNER_2]] ]
; CHECK-NEXT: [[LCSSA_PTR_IV_2:%.*]] = phi ptr [ [[PTR_IV_2]], [[INNER_2]] ]
; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds double, ptr [[PTR_PHI]], i64 1
; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds double, ptr [[LCSSA_PTR_IV_2]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[INDVAR_LCSSA]], 3
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 24
-; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[LCSSA_PTR_IV_1]], i64 [[TMP1]]
+; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[LCSSA_PTR_IV_2]], i64 16
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[GEP_7]], [[GEP_1]]
; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[PTR_PHI]], [[SCEVGEP3]]
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
@@ -104,7 +99,7 @@ define void @test(ptr %arg, i64 %arg1) {
; CHECK-NEXT: br i1 [[C_2]], label [[OUTER_LATCH_LOOPEXIT4:%.*]], label [[INNER_3]]
; CHECK: outer.latch.loopexit:
; CHECK-NEXT: br label [[OUTER_LATCH]]
-; CHECK: outer.latch.loopexit4:
+; CHECK: outer.latch.loopexit3:
; CHECK-NEXT: br label [[OUTER_LATCH]]
; CHECK: outer.latch:
; CHECK-NEXT: br label [[INNER_1_LVER_CHECK]]
diff --git a/llvm/test/Transforms/LoopLoadElim/versioning-scev-invalidation.ll b/llvm/test/Transforms/LoopLoadElim/versioning-scev-invalidation.ll
index 7472680..abad40d 100644
--- a/llvm/test/Transforms/LoopLoadElim/versioning-scev-invalidation.ll
+++ b/llvm/test/Transforms/LoopLoadElim/versioning-scev-invalidation.ll
@@ -65,8 +65,7 @@ define void @g(ptr %dst.1, ptr %start, i64 %N) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[LCSSA_PTR_IV_1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: store <4 x double> zeroinitializer, ptr [[TMP5]], align 8
+; CHECK-NEXT: store <4 x double> zeroinitializer, ptr [[NEXT_GEP]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -84,7 +83,7 @@ define void @g(ptr %dst.1, ptr %start, i64 %N) {
; CHECK-NEXT: [[PTR_IV_2_NEXT]] = getelementptr inbounds double, ptr [[PTR_IV_2]], i64 1
; CHECK-NEXT: [[IV_2_NEXT]] = add nuw nsw i64 [[IV_2]], 1
; CHECK-NEXT: [[EXITCOND_1_NOT:%.*]] = icmp eq i64 [[IV_2_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_1_NOT]], label [[EXIT_LOOPEXIT:%.*]], label [[LOOP_2]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_1_NOT]], label [[EXIT_LOOPEXIT:%.*]], label [[LOOP_2]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: exit.loopexit:
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
diff --git a/llvm/test/Transforms/LoopUnroll/Hexagon/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopUnroll/Hexagon/reuse-lcssa-phi-scev-expansion.ll
new file mode 100644
index 0000000..f74fb14
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/Hexagon/reuse-lcssa-phi-scev-expansion.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-unroll -S %s | FileCheck %s
+
+target triple = "hexagon-unknown-linux"
+
+declare void @foo()
+
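+; Runtime unrolling of %loop.4 (by 8, with an epilogue) needs the trip count
+; %iv.3 from the earlier loop; check that the existing LCSSA phi is reused and
+; that LCSSA form is preserved across the expansion.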
+define void @preserve_lcssa_when_reusing_existing_phi() {
+; CHECK-LABEL: define void @preserve_lcssa_when_reusing_existing_phi() {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[LOOP_1_HEADER:.*]]
+; CHECK: [[LOOP_1_HEADER]]:
+; CHECK-NEXT: br label %[[LOOP_2_HEADER:.*]]
+; CHECK: [[LOOP_2_HEADER]]:
+; CHECK-NEXT: br label %[[LOOP_3:.*]]
+; CHECK: [[LOOP_3]]:
+; CHECK-NEXT: [[IV_3:%.*]] = phi i32 [ [[IV_3_NEXT:%.*]], %[[LOOP_3]] ], [ 0, %[[LOOP_2_HEADER]] ]
+; CHECK-NEXT: call void @foo()
+; CHECK-NEXT: [[IV_3_NEXT]] = add i32 [[IV_3]], 1
+; CHECK-NEXT: br i1 false, label %[[PH:.*]], label %[[LOOP_3]]
+; CHECK: [[PH]]:
+; CHECK-NEXT: [[IV_3_LCSSA:%.*]] = phi i32 [ [[IV_3]], %[[LOOP_3]] ]
+; CHECK-NEXT: br i1 true, label %[[LOOP_2_LATCH:.*]], label %[[LOOP_4_PREHEADER:.*]]
+; CHECK: [[LOOP_4_PREHEADER]]:
+; CHECK-NEXT: [[IV_3_LCSSA_LCSSA1:%.*]] = phi i32 [ [[IV_3_LCSSA]], %[[PH]] ]
+; CHECK-NEXT: [[IV_3_LCSSA_LCSSA:%.*]] = phi i32 [ [[IV_3_LCSSA]], %[[PH]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[IV_3_LCSSA_LCSSA1]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], -1
+; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[TMP1]], 7
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 7
+; CHECK-NEXT: br i1 [[TMP3]], label %[[LOOP_1_LATCH_UNR_LCSSA:.*]], label %[[LOOP_4_PREHEADER_NEW:.*]]
+; CHECK: [[LOOP_4_PREHEADER_NEW]]:
+; CHECK-NEXT: br label %[[LOOP_4:.*]]
+; CHECK: [[LOOP_2_LATCH]]:
+; CHECK-NEXT: br label %[[LOOP_2_HEADER]]
+; CHECK: [[LOOP_4]]:
+; CHECK-NEXT: [[IV_4:%.*]] = phi i32 [ 0, %[[LOOP_4_PREHEADER_NEW]] ], [ [[INC_I_7:%.*]], %[[LOOP_4]] ]
+; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ 0, %[[LOOP_4_PREHEADER_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[LOOP_4]] ]
+; CHECK-NEXT: call void @foo()
+; CHECK-NEXT: call void @foo()
+; CHECK-NEXT: call void @foo()
+; CHECK-NEXT: call void @foo()
+; CHECK-NEXT: call void @foo()
+; CHECK-NEXT: call void @foo()
+; CHECK-NEXT: call void @foo()
+; CHECK-NEXT: call void @foo()
+; CHECK-NEXT: [[INC_I_7]] = add nuw nsw i32 [[IV_4]], 8
+; CHECK-NEXT: [[NITER_NEXT_7]] = add nuw nsw i32 [[NITER]], 8
+; CHECK-NEXT: br i1 true, label %[[LOOP_1_LATCH_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP_4]]
+; CHECK: [[LOOP_1_LATCH_UNR_LCSSA_LOOPEXIT]]:
+; CHECK-NEXT: [[IV_4_UNR_PH:%.*]] = phi i32 [ [[INC_I_7]], %[[LOOP_4]] ]
+; CHECK-NEXT: br label %[[LOOP_1_LATCH_UNR_LCSSA]]
+; CHECK: [[LOOP_1_LATCH_UNR_LCSSA]]:
+; CHECK-NEXT: [[IV_4_UNR:%.*]] = phi i32 [ 0, %[[LOOP_4_PREHEADER]] ], [ [[IV_4_UNR_PH]], %[[LOOP_1_LATCH_UNR_LCSSA_LOOPEXIT]] ]
+; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0
+; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_4_EPIL_PREHEADER:.*]], label %[[LOOP_1_LATCH:.*]]
+; CHECK: [[LOOP_4_EPIL_PREHEADER]]:
+; CHECK-NEXT: br label %[[LOOP_4_EPIL:.*]]
+; CHECK: [[LOOP_4_EPIL]]:
+; CHECK-NEXT: [[IV_4_EPIL:%.*]] = phi i32 [ [[INC_I_EPIL:%.*]], %[[LOOP_4_EPIL]] ], [ [[IV_4_UNR]], %[[LOOP_4_EPIL_PREHEADER]] ]
+; CHECK-NEXT: [[EPIL_ITER:%.*]] = phi i32 [ 0, %[[LOOP_4_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[LOOP_4_EPIL]] ]
+; CHECK-NEXT: call void @foo()
+; CHECK-NEXT: [[INC_I_EPIL]] = add i32 [[IV_4_EPIL]], 1
+; CHECK-NEXT: [[EC_EPIL:%.*]] = icmp eq i32 [[IV_4_EPIL]], [[IV_3_LCSSA_LCSSA]]
+; CHECK-NEXT: [[EPIL_ITER_NEXT]] = add i32 [[EPIL_ITER]], 1
+; CHECK-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i32 [[EPIL_ITER_NEXT]], [[XTRAITER]]
+; CHECK-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[LOOP_4_EPIL]], label %[[LOOP_1_LATCH_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[LOOP_1_LATCH_EPILOG_LCSSA]]:
+; CHECK-NEXT: br label %[[LOOP_1_LATCH]]
+; CHECK: [[LOOP_1_LATCH]]:
+; CHECK-NEXT: br label %[[LOOP_1_HEADER]]
+;
+entry:
+ br label %loop.1.header
+
+loop.1.header:
+ br label %loop.2.header
+
+loop.2.header:
+ br label %loop.3
+
+loop.3:
+ %iv.3 = phi i32 [ %iv.3.next, %loop.3 ], [ 0, %loop.2.header ]
+ call void @foo()
+ %iv.3.next = add i32 %iv.3, 1
+ br i1 false, label %ph, label %loop.3
+
+ph:
+ br i1 true, label %loop.2.latch, label %loop.4
+
+loop.2.latch:
+ br label %loop.2.header
+
+loop.4:
+ %iv.4 = phi i32 [ 0, %ph ], [ %inc.i, %loop.4 ]
+ call void @foo()
+ %inc.i = add i32 %iv.4, 1
+ %ec = icmp eq i32 %iv.4, %iv.3
+ br i1 %ec, label %loop.1.latch, label %loop.4
+
+loop.1.latch:
+ br label %loop.1.header
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.unroll.disable"}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
index 43b9424..e44ddbc 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
@@ -22,8 +22,7 @@ define void @test_blend_feeding_replicated_store_1(i64 %N, ptr noalias %src, ptr
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE30:.*]] ]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i1> zeroinitializer, <16 x i1> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i1> [[TMP6]], splat (i1 true)
@@ -213,8 +212,7 @@ define void @test_blend_feeding_replicated_store_2(ptr noalias %src, ptr %dst, i
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE30:.*]] ]
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[IV]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[GEP_SRC]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[GEP_SRC]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i1> [[TMP3]], splat (i1 true)
; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP4]], <16 x i1> [[TMP5]], <16 x i1> zeroinitializer
@@ -368,7 +366,7 @@ define void @test_blend_feeding_replicated_store_2(ptr noalias %src, ptr %dst, i
; CHECK-NEXT: [[TMP71:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
; CHECK-NEXT: br i1 [[TMP71]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 96, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
@@ -388,7 +386,7 @@ define void @test_blend_feeding_replicated_store_2(ptr noalias %src, ptr %dst, i
; CHECK: [[LOOP_LATCH]]:
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV1]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 100
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
index 8c2a48a..b7706da 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll
@@ -14,25 +14,23 @@ define void @fshl_operand_first_order_recurrence(ptr %dst, ptr noalias %src) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i64> [ <i64 poison, i64 0>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD1:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
; CHECK-NEXT: [[WIDE_LOAD1]] = load <2 x i64>, ptr [[TMP5]], align 8
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[VECTOR_RECUR]], <2 x i64> [[WIDE_LOAD]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[WIDE_LOAD]], <2 x i64> [[WIDE_LOAD1]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP8:%.*]] = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> splat (i64 1), <2 x i64> [[TMP6]], <2 x i64> splat (i64 1))
; CHECK-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> splat (i64 1), <2 x i64> [[TMP7]], <2 x i64> splat (i64 1))
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 2
-; CHECK-NEXT: store <2 x i64> [[TMP8]], ptr [[TMP12]], align 8
+; CHECK-NEXT: store <2 x i64> [[TMP8]], ptr [[TMP10]], align 8
; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[TMP13]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i64> [[WIDE_LOAD1]], i32 1
-; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
@@ -47,7 +45,7 @@ define void @fshl_operand_first_order_recurrence(ptr %dst, ptr noalias %src) {
; CHECK-NEXT: store i64 [[OR]], ptr [[GEP_DST]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 100
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -79,16 +77,14 @@ define void @powi_call(ptr %P) {
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[P]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[P]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[WIDE_LOAD]], i32 3)
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[P]], i32 0
-; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TMP4]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[P]], align 8
; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
index 812bca9..027a88d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
@@ -10,9 +10,9 @@ target triple = "aarch64-unknown-linux-gnu"
; We expect the branch weight computations after vectorisation to use
; vscale=2 for neoverse-v1 and vscale=1 for neoverse-v2.
-define void @_Z3foov() {
+define void @_Z3foov(i64 %n) {
; CHECK-V1-IC1-LABEL: define void @_Z3foov(
-; CHECK-V1-IC1-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-V1-IC1-SAME: i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-V1-IC1: [[ENTRY:.*:]]
; CHECK-V1-IC1: br i1 [[MIN_ITERS_CHECK:%.*]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]]
; CHECK-V1-IC1: [[VECTOR_PH]]:
@@ -28,35 +28,45 @@ define void @_Z3foov() {
; CHECK-V1-IC1: [[FOR_COND_CLEANUP]]:
;
; CHECK-V2-IC1-LABEL: define void @_Z3foov(
-; CHECK-V2-IC1-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-V2-IC1-SAME: i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-V2-IC1: [[ENTRY:.*:]]
-; CHECK-V2-IC1: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]]
+; CHECK-V2-IC1: br i1 [[MIN_ITERS_CHECK:%.*]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]]
; CHECK-V2-IC1: [[VECTOR_PH]]:
; CHECK-V2-IC1: br label %[[VECTOR_BODY:.*]]
; CHECK-V2-IC1: [[VECTOR_BODY]]:
; CHECK-V2-IC1: br i1 [[TMP4:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK-V2-IC1: [[MIDDLE_BLOCK]]:
-; CHECK-V2-IC1: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]], !prof [[PROF5:![0-9]+]]
+; CHECK-V2-IC1: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]], !prof [[PROF5:![0-9]+]]
; CHECK-V2-IC1: [[SCALAR_PH]]:
; CHECK-V2-IC1: br label %[[FOR_BODY:.*]]
; CHECK-V2-IC1: [[FOR_BODY]]:
-; CHECK-V2-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-V2-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF5:![0-9]+]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-V2-IC1: [[FOR_COND_CLEANUP]]:
;
; CHECK-V2-IC4-LABEL: define void @_Z3foov(
-; CHECK-V2-IC4-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-V2-IC4-SAME: i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-V2-IC4: [[VEC_EPILOG_VECTOR_BODY1:.*:]]
-; CHECK-V2-IC4: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]]
+; CHECK-V2-IC4: br i1 [[MIN_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]], !prof [[PROF0:![0-9]+]]
+; CHECK-V2-IC4: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-V2-IC4: br i1 [[MIN_ITERS_CHECK1:%.*]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0]]
; CHECK-V2-IC4: [[VECTOR_PH]]:
; CHECK-V2-IC4: br label %[[VECTOR_BODY:.*]]
; CHECK-V2-IC4: [[VECTOR_BODY]]:
; CHECK-V2-IC4: br i1 [[TMP10:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK-V2-IC4: [[MIDDLE_BLOCK]]:
-; CHECK-V2-IC4: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]], !prof [[PROF5:![0-9]+]]
-; CHECK-V2-IC4: [[SCALAR_PH]]:
+; CHECK-V2-IC4: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF5:![0-9]+]]
+; CHECK-V2-IC4: [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-V2-IC4: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF6:![0-9]+]]
+; CHECK-V2-IC4: [[VEC_EPILOG_PH]]:
+; CHECK-V2-IC4: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK-V2-IC4: [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-V2-IC4: br i1 [[TMP15:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-V2-IC4: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-V2-IC4: br i1 [[CMP_N10:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF8:![0-9]+]]
+; CHECK-V2-IC4: [[VEC_EPILOG_SCALAR_PH]]:
; CHECK-V2-IC4: br label %[[FOR_BODY:.*]]
; CHECK-V2-IC4: [[FOR_BODY]]:
-; CHECK-V2-IC4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-V2-IC4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-V2-IC4: [[FOR_COND_CLEANUP]]:
;
entry:
@@ -69,7 +79,7 @@ for.body: ; preds = %for.body, %entry
%arrayidx2 = getelementptr inbounds [1024 x i32], ptr @a, i64 0, i64 %iv
store i32 %load, ptr %arrayidx2, align 4
%iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1024
+ %exitcond = icmp eq i64 %iv.next, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !0
for.cond.cleanup: ; preds = %for.body
@@ -91,9 +101,8 @@ for.cond.cleanup: ; preds = %for.body
; CHECK-V2-IC1: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
; CHECK-V2-IC1: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK-V2-IC1: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-V2-IC1: [[PROF5]] = !{!"branch_weights", i32 1, i32 3}
-; CHECK-V2-IC1: [[PROF6]] = !{!"branch_weights", i32 0, i32 0}
-; CHECK-V2-IC1: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]], [[META3]]}
+; CHECK-V2-IC1: [[PROF5]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK-V2-IC1: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]], [[META3]]}
;.
; CHECK-V2-IC4: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
; CHECK-V2-IC4: [[PROF1]] = !{!"branch_weights", i32 1, i32 63}
@@ -101,6 +110,9 @@ for.cond.cleanup: ; preds = %for.body
; CHECK-V2-IC4: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK-V2-IC4: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK-V2-IC4: [[PROF5]] = !{!"branch_weights", i32 1, i32 15}
-; CHECK-V2-IC4: [[PROF6]] = !{!"branch_weights", i32 0, i32 0}
-; CHECK-V2-IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]], [[META3]]}
+; CHECK-V2-IC4: [[PROF6]] = !{!"branch_weights", i32 4, i32 0}
+; CHECK-V2-IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META3]], [[META4]]}
+; CHECK-V2-IC4: [[PROF8]] = !{!"branch_weights", i32 1, i32 3}
+; CHECK-V2-IC4: [[PROF9]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK-V2-IC4: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]], [[META3]]}
;.
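(Aside on the branch-weight arithmetic that the check-prof-info.ll comment above relies on — a minimal sketch, not part of the patch, assuming a scalar latch profile of !{!"branch_weights", i32 1, i32 1023}, i.e. a 1024-iteration loop exiting once per 1023 back edges:

; hypothetical scalar latch profile: 1023 back edges per exit
!0 = !{!"branch_weights", i32 1, i32 1023}
; a vector body covering VF x IC = 16 elements per iteration (e.g. vscale=1,
; VF=4, IC=4 on neoverse-v2) takes roughly 1023 / 16 ~ 63 back edges per
; exit, which is how a scaled weight like [[PROF1]] = {1, 63} arises:
!1 = !{!"branch_weights", i32 1, i32 63}

The vectoriser keeps the exit count and divides the back-edge weight by the number of scalar iterations each vector iteration covers, so wider effective VFs yield proportionally smaller back-edge weights.)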
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
index 95f3eb7..795de3d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
@@ -33,8 +33,7 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw <vscale x 8 x i64> [[VEC_IND]], splat (i64 3)
; CHECK-NEXT: [[TMP11:%.*]] = lshr <vscale x 8 x i64> [[BROADCAST_SPLAT]], [[TMP10]]
; CHECK-NEXT: [[TMP14:%.*]] = trunc <vscale x 8 x i64> [[TMP11]] to <vscale x 8 x i8>
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP14]], ptr [[TMP17]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP14]], ptr [[NEXT_GEP]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 8)
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
@@ -117,8 +116,7 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw <vscale x 8 x i64> [[VEC_IND]], splat (i64 3)
; CHECK-NEXT: [[TMP11:%.*]] = lshr <vscale x 8 x i64> [[BROADCAST_SPLAT]], [[TMP10]]
; CHECK-NEXT: [[TMP14:%.*]] = trunc <vscale x 8 x i64> [[TMP11]] to <vscale x 8 x i8>
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP14]], ptr [[TMP17]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP14]], ptr [[NEXT_GEP]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 46a194d..0232d88 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -82,9 +82,8 @@ define void @loop_dependent_cond(ptr %src, ptr noalias %dst, i64 %N) {
; DEFAULT: [[VECTOR_BODY]]:
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ]
; DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
-; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[TMP3]], i32 0
; DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr double, ptr [[TMP3]], i32 2
-; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
+; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
; DEFAULT-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP6]], align 8
; DEFAULT-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[WIDE_LOAD]])
; DEFAULT-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[WIDE_LOAD1]])
@@ -341,9 +340,8 @@ define void @latch_branch_cost(ptr %dst) {
; DEFAULT: [[VECTOR_BODY]]:
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
-; DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP2]], i32 0
; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP2]], i32 16
-; DEFAULT-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP6]], align 1
+; DEFAULT-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP2]], align 1
; DEFAULT-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP5]], align 1
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; DEFAULT-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
@@ -358,8 +356,7 @@ define void @latch_branch_cost(ptr %dst) {
; DEFAULT: [[VEC_EPILOG_VECTOR_BODY]]:
; DEFAULT-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; DEFAULT-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX1]]
-; DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0
-; DEFAULT-NEXT: store <4 x i8> zeroinitializer, ptr [[TMP9]], align 1
+; DEFAULT-NEXT: store <4 x i8> zeroinitializer, ptr [[TMP8]], align 1
; DEFAULT-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4
; DEFAULT-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 100
; DEFAULT-NEXT: br i1 [[TMP10]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -575,8 +572,7 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt
; DEFAULT-NEXT: store i32 [[TMP22]], ptr [[E]], align 4, !alias.scope [[META14]], !noalias [[META16]]
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE37]]
; DEFAULT: [[PRED_STORE_CONTINUE37]]:
-; DEFAULT-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP16]], i32 0
-; DEFAULT-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP17]], i32 4, <4 x i1> [[TMP8]]), !alias.scope [[META18:![0-9]+]], !noalias [[META19:![0-9]+]]
+; DEFAULT-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP16]], i32 4, <4 x i1> [[TMP8]]), !alias.scope [[META18:![0-9]+]], !noalias [[META19:![0-9]+]]
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; DEFAULT-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
@@ -674,13 +670,12 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
; DEFAULT-NEXT: [[TMP2:%.*]] = or <8 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
; DEFAULT-NEXT: [[TMP3:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x double>
-; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i32 0
-; DEFAULT-NEXT: store <8 x double> [[TMP3]], ptr [[TMP4]], align 8
+; DEFAULT-NEXT: store <8 x double> [[TMP3]], ptr [[NEXT_GEP]], align 8
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; DEFAULT-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
-; DEFAULT-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; DEFAULT-NEXT: br label %[[SCALAR_PH]]
; DEFAULT: [[SCALAR_PH]]:
; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[DST]], %[[ENTRY]] ]
; DEFAULT-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 512, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
@@ -696,7 +691,7 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
; DEFAULT-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 8
; DEFAULT-NEXT: [[IV_CLAMP:%.*]] = and i64 [[IV]], 4294967294
; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_CLAMP]], 512
-; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP23:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP23:![0-9]+]]
; DEFAULT: [[EXIT]]:
; DEFAULT-NEXT: ret void
;
@@ -730,8 +725,7 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
; PRED-NEXT: [[TMP13:%.*]] = or <vscale x 2 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
; PRED-NEXT: [[TMP14:%.*]] = uitofp <vscale x 2 x i16> [[TMP13]] to <vscale x 2 x double>
-; PRED-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i32 0
-; PRED-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr [[TMP15]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; PRED-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr [[NEXT_GEP]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP10]])
; PRED-NEXT: [[TMP16:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
@@ -1492,7 +1486,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) {
; DEFAULT-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; DEFAULT-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
-; DEFAULT-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; DEFAULT-NEXT: br label %[[SCALAR_PH]]
; DEFAULT: [[SCALAR_PH]]:
; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; DEFAULT-NEXT: br label %[[LOOP_HEADER:.*]]
@@ -1506,7 +1500,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) {
; DEFAULT-NEXT: [[T:%.*]] = trunc nuw nsw i64 [[IV_NEXT]] to i32
; DEFAULT-NEXT: store i32 [[T]], ptr [[DST]], align 4
; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 21
-; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP29:![0-9]+]]
+; DEFAULT-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP29:![0-9]+]]
; DEFAULT: [[EXIT]]:
; DEFAULT-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
index 73ef8534b..06e6306 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
@@ -469,7 +469,7 @@ define void @old_and_new_size_equalko(ptr noalias %src, ptr noalias %dst) {
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
index d42be20..1ad1e426 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
@@ -38,11 +38,10 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], [[TMP26]]
; CHECK-NEXT: [[TMP32:%.*]] = sext i32 [[TMP30]] to i64
; CHECK-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP32]]
-; CHECK-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP34]], i32 0
; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], 2
; CHECK-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP34]], i64 [[TMP38]]
-; CHECK-NEXT: store <vscale x 2 x double> zeroinitializer, ptr [[TMP36]], align 8
+; CHECK-NEXT: store <vscale x 2 x double> zeroinitializer, ptr [[TMP34]], align 8
; CHECK-NEXT: store <vscale x 2 x double> zeroinitializer, ptr [[TMP39]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -149,8 +148,7 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i
; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], [[TMP30]]
; CHECK-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64
; CHECK-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP33]]
-; CHECK-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[TMP34]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> zeroinitializer, ptr [[TMP35]], i32 8, <vscale x 2 x i1> [[TMP23]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> zeroinitializer, ptr [[TMP34]], i32 8, <vscale x 2 x i1> [[TMP23]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP14]])
; CHECK-NEXT: [[TMP36:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
@@ -275,8 +273,7 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) {
; CHECK-NEXT: [[TMP36:%.*]] = shl i64 [[TMP35]], 32
; CHECK-NEXT: [[TMP37:%.*]] = ashr i64 [[TMP36]], 32
; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP37]]
-; CHECK-NEXT: [[TMP39:%.*]] = getelementptr i64, ptr [[TMP38]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP23]], ptr [[TMP39]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP23]], ptr [[TMP38]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP14]])
; CHECK-NEXT: [[TMP47:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll
index e28c79e..221d944 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll
@@ -15,15 +15,13 @@ define void @check_widen_intrinsic_with_nnan(ptr noalias %dst.0, ptr noalias %ds
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE6:.*]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[SRC_1]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP1]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP4:%.*]] = fcmp olt <4 x double> [[TMP3]], splat (double 1.000000e+00)
; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true)
; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], -1
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr double, ptr [[DST_0]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr double, ptr [[TMP7]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr [[TMP8]], i32 8, <4 x i1> [[TMP5]])
+; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr [[TMP7]], i32 8, <4 x i1> [[TMP5]])
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
; CHECK: [[PRED_LOAD_IF]]:
@@ -58,23 +56,21 @@ define void @check_widen_intrinsic_with_nnan(ptr noalias %dst.0, ptr noalias %ds
; CHECK-NEXT: [[TMP24:%.*]] = phi <4 x double> [ [[TMP20]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP23]], %[[PRED_LOAD_IF5]] ]
; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], -1
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[DST_0]], i64 [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP26]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr [[TMP27]], i32 8, <4 x i1> [[TMP4]])
+; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr [[TMP26]], i32 8, <4 x i1> [[TMP4]])
; CHECK-NEXT: [[TMP28:%.*]] = fcmp oeq <4 x double> [[TMP24]], zeroinitializer
; CHECK-NEXT: [[TMP29:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP28]], <4 x i1> zeroinitializer
; CHECK-NEXT: [[TMP30:%.*]] = or <4 x i1> [[TMP5]], [[TMP29]]
; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[TMP29]], i32 0
; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP31]], i64 [[TMP25]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[DST_1]], i64 [[PREDPHI]]
-; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr [[TMP32]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> splat (i32 10), ptr [[TMP33]], i32 4, <4 x i1> [[TMP30]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> splat (i32 10), ptr [[TMP32]], i32 4, <4 x i1> [[TMP30]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP34]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
index c824bee..ab008e7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
@@ -23,11 +23,10 @@ define void @f1(ptr %A) #0 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <vscale x 4 x i32> splat (i32 1), ptr [[TMP7]], align 4
+; CHECK-NEXT: store <vscale x 4 x i32> splat (i32 1), ptr [[TMP6]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
index 895781de..8013a8f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
@@ -27,9 +27,8 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <16 x i8> [[VEC_IND]], splat (i8 16)
; CHECK-NEXT: [[INDEX4:%.*]] = trunc i32 [[INDEX]] to i8
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[INDEX4]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP8]], align 8
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 8
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], splat (i8 3)
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD3]], splat (i8 3)
@@ -72,8 +71,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <8 x i8> [ [[DOTSPLAT]], %[[VEC_EPILOG_PH]] ], [ [[TMP20:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[IV:%.*]] = trunc i32 [[INDEX6]] to i8
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[IV]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, ptr [[TMP18]], align 8
+; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, ptr [[GEP]], align 8
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD12]], splat (i8 3)
; CHECK-NEXT: [[TMP20]] = select <8 x i1> [[TMP19]], <8 x i8> [[VEC_IND7]], <8 x i8> [[VEC_PHI9]]
; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i32 [[INDEX6]], 8
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll
index e6fad4b..37b5f5d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll
@@ -19,20 +19,18 @@ define void @add_i8(ptr noalias nocapture noundef writeonly %A, ptr nocapture no
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 32
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 48
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[C:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 16
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 32
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 48
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
@@ -41,11 +39,10 @@ define void @add_i8(ptr noalias nocapture noundef writeonly %A, ptr nocapture no
; CHECK-NEXT: [[TMP13:%.*]] = add <16 x i8> [[WIDE_LOAD7]], [[WIDE_LOAD3]]
; CHECK-NEXT: [[TMP14:%.*]] = add <16 x i8> [[WIDE_LOAD8]], [[WIDE_LOAD4]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 0
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 16
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 32
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 48
-; CHECK-NEXT: store <16 x i8> [[TMP11]], ptr [[TMP16]], align 1
+; CHECK-NEXT: store <16 x i8> [[TMP11]], ptr [[TMP15]], align 1
; CHECK-NEXT: store <16 x i8> [[TMP12]], ptr [[TMP17]], align 1
; CHECK-NEXT: store <16 x i8> [[TMP13]], ptr [[TMP18]], align 1
; CHECK-NEXT: store <16 x i8> [[TMP14]], ptr [[TMP19]], align 1
@@ -67,15 +64,12 @@ define void @add_i8(ptr noalias nocapture noundef writeonly %A, ptr nocapture no
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX11]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, ptr [[TMP23]], align 1
+; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, ptr [[TMP22]], align 1
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX11]]
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP24]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i8>, ptr [[TMP25]], align 1
+; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i8>, ptr [[TMP24]], align 1
; CHECK-NEXT: [[TMP26:%.*]] = add <8 x i8> [[WIDE_LOAD13]], [[WIDE_LOAD12]]
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX11]]
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i32 0
-; CHECK-NEXT: store <8 x i8> [[TMP26]], ptr [[TMP28]], align 1
+; CHECK-NEXT: store <8 x i8> [[TMP26]], ptr [[TMP27]], align 1
; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8
; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC10]]
; CHECK-NEXT: br i1 [[TMP29]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -135,20 +129,18 @@ define void @add_i16(ptr noalias nocapture noundef writeonly %A, ptr nocapture n
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 8
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 16
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 24
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i16>, ptr [[TMP4]], align 1
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i16>, ptr [[TMP5]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[C:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 8
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 16
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 24
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i16>, ptr [[TMP7]], align 1
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i16>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i16>, ptr [[TMP8]], align 1
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i16>, ptr [[TMP9]], align 1
; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i16>, ptr [[TMP10]], align 1
@@ -157,11 +149,10 @@ define void @add_i16(ptr noalias nocapture noundef writeonly %A, ptr nocapture n
; CHECK-NEXT: [[TMP13:%.*]] = add <8 x i16> [[WIDE_LOAD7]], [[WIDE_LOAD3]]
; CHECK-NEXT: [[TMP14:%.*]] = add <8 x i16> [[WIDE_LOAD8]], [[WIDE_LOAD4]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i16, ptr [[TMP15]], i32 0
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[TMP15]], i32 8
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[TMP15]], i32 16
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[TMP15]], i32 24
-; CHECK-NEXT: store <8 x i16> [[TMP11]], ptr [[TMP16]], align 1
+; CHECK-NEXT: store <8 x i16> [[TMP11]], ptr [[TMP15]], align 1
; CHECK-NEXT: store <8 x i16> [[TMP12]], ptr [[TMP17]], align 1
; CHECK-NEXT: store <8 x i16> [[TMP13]], ptr [[TMP18]], align 1
; CHECK-NEXT: store <8 x i16> [[TMP14]], ptr [[TMP19]], align 1
@@ -183,15 +174,12 @@ define void @add_i16(ptr noalias nocapture noundef writeonly %A, ptr nocapture n
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[INDEX11]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[TMP22]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i16>, ptr [[TMP23]], align 1
+; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i16>, ptr [[TMP22]], align 1
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[C]], i64 [[INDEX11]]
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[TMP24]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i16>, ptr [[TMP25]], align 1
+; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i16>, ptr [[TMP24]], align 1
; CHECK-NEXT: [[TMP26:%.*]] = add <4 x i16> [[WIDE_LOAD13]], [[WIDE_LOAD12]]
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[INDEX11]]
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i16, ptr [[TMP27]], i32 0
-; CHECK-NEXT: store <4 x i16> [[TMP26]], ptr [[TMP28]], align 1
+; CHECK-NEXT: store <4 x i16> [[TMP26]], ptr [[TMP27]], align 1
; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 4
; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC10]]
; CHECK-NEXT: br i1 [[TMP29]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -251,20 +239,18 @@ define void @add_i32(ptr noalias nocapture noundef writeonly %A, ptr nocapture n
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 4
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 8
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 12
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP4]], align 1
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP5]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 4
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 8
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 12
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP7]], align 1
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP8]], align 1
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP9]], align 1
; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP10]], align 1
@@ -273,11 +259,10 @@ define void @add_i32(ptr noalias nocapture noundef writeonly %A, ptr nocapture n
; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD3]]
; CHECK-NEXT: [[TMP14:%.*]] = add <4 x i32> [[WIDE_LOAD8]], [[WIDE_LOAD4]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 4
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 8
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 12
-; CHECK-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP16]], align 1
+; CHECK-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP15]], align 1
; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr [[TMP17]], align 1
; CHECK-NEXT: store <4 x i32> [[TMP13]], ptr [[TMP18]], align 1
; CHECK-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP19]], align 1
@@ -299,15 +284,12 @@ define void @add_i32(ptr noalias nocapture noundef writeonly %A, ptr nocapture n
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX11]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, ptr [[TMP23]], align 1
+; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, ptr [[TMP22]], align 1
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX11]]
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, ptr [[TMP25]], align 1
+; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, ptr [[TMP24]], align 1
; CHECK-NEXT: [[TMP26:%.*]] = add <4 x i32> [[WIDE_LOAD13]], [[WIDE_LOAD12]]
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX11]]
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP28]], align 1
+; CHECK-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 1
; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 4
; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC10]]
; CHECK-NEXT: br i1 [[TMP29]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
index 83b35309..5b15896 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
@@ -36,9 +36,8 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) {
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP12]])
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP13]])
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 2
-; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP14]], align 1
+; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[NEXT_GEP]], align 1
; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP15]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
@@ -65,8 +64,7 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) {
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP22]])
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP21]], i32 1
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP23]])
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[NEXT_GEP7]], i32 0
-; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP24]], align 1
+; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[NEXT_GEP7]], align 1
; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 2
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT9]], 10000
; CHECK-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
@@ -126,9 +124,8 @@ define void @test_widen_induction(ptr %A, i64 %N) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 2
-; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP2]], align 4
+; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP1]], align 4
; CHECK-NEXT: store <2 x i64> [[STEP_ADD]], ptr [[TMP3]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2)
@@ -153,8 +150,7 @@ define void @test_widen_induction(ptr %A, i64 %N) {
; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND6:%.*]] = phi <2 x i64> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <2 x i64> [[VEC_IND6]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store <2 x i64> [[VEC_IND6]], ptr [[TMP6]], align 4
; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 2
; CHECK-NEXT: [[VEC_IND_NEXT7]] = add <2 x i64> [[VEC_IND6]], splat (i64 2)
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]]
@@ -213,9 +209,8 @@ define void @test_widen_induction_variable_start(ptr %A, i64 %N, i64 %start) {
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[START]], [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
-; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP3]], align 4
+; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP2]], align 4
; CHECK-NEXT: store <2 x i64> [[STEP_ADD]], ptr [[TMP4]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2)
@@ -244,8 +239,7 @@ define void @test_widen_induction_variable_start(ptr %A, i64 %N, i64 %start) {
; CHECK-NEXT: [[VEC_IND11:%.*]] = phi <2 x i64> [ [[INDUCTION10]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX13:%.*]] = add i64 [[START]], [[INDEX7]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX13]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; CHECK-NEXT: store <2 x i64> [[VEC_IND11]], ptr [[TMP8]], align 4
+; CHECK-NEXT: store <2 x i64> [[VEC_IND11]], ptr [[TMP7]], align 4
; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX7]], 2
; CHECK-NEXT: [[VEC_IND_NEXT12]] = add <2 x i64> [[VEC_IND11]], splat (i64 2)
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC3]]
@@ -300,9 +294,8 @@ define void @test_widen_induction_step_2(ptr %A, i64 %N, i32 %step) {
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 10)
; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[STEP_ADD]], splat (i64 10)
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 2
-; CHECK-NEXT: store <2 x i64> [[TMP2]], ptr [[TMP4]], align 4
+; CHECK-NEXT: store <2 x i64> [[TMP2]], ptr [[TMP1]], align 4
; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2)
@@ -328,8 +321,7 @@ define void @test_widen_induction_step_2(ptr %A, i64 %N, i32 %step) {
; CHECK-NEXT: [[VEC_IND8:%.*]] = phi <2 x i64> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX7]]
; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i64> [[VEC_IND8]], splat (i64 10)
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[TMP10]], align 4
+; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], 2
; CHECK-NEXT: [[VEC_IND_NEXT9]] = add <2 x i64> [[VEC_IND8]], splat (i64 2)
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[IND_END]]
@@ -374,66 +366,17 @@ exit:
define void @test_widen_extended_induction(ptr %dst) {
; CHECK-LABEL: @test_widen_extended_induction(
-; CHECK-NEXT: iter.check:
-; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
-; CHECK: vector.scevcheck:
-; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK: vector.main.loop.iter.check:
-; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ <i8 0, i8 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i8> [[VEC_IND]], splat (i8 2)
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i8
-; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[OFFSET_IDX]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [6 x i8], ptr [[DST:%.*]], i64 0, i64 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 2
-; CHECK-NEXT: store <2 x i8> [[VEC_IND]], ptr [[TMP3]], align 1
-; CHECK-NEXT: store <2 x i8> [[STEP_ADD]], ptr [[TMP4]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[STEP_ADD]], splat (i8 2)
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 10000
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK: vec.epilog.ph:
-; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 10000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], <i8 0, i8 1>
-; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX2:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND3:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX5:%.*]] = trunc i32 [[INDEX2]] to i8
-; CHECK-NEXT: [[TMP7:%.*]] = zext i8 [[OFFSET_IDX5]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [6 x i8], ptr [[DST]], i64 0, i64 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <2 x i8> [[VEC_IND3]], ptr [[TMP9]], align 1
-; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i32 [[INDEX2]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT4]] = add <2 x i8> [[VEC_IND3]], splat (i8 2)
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT6]], 10000
-; CHECK-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
-; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK: vec.epilog.scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i8 [ 16, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL1]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[IV_EXT:%.*]] = zext i8 [[IV]] to i64
-; CHECK-NEXT: [[ARRAYIDX1449:%.*]] = getelementptr inbounds [6 x i8], ptr [[DST]], i64 0, i64 [[IV_EXT]]
+; CHECK-NEXT: [[ARRAYIDX1449:%.*]] = getelementptr inbounds [6 x i8], ptr [[DST:%.*]], i64 0, i64 [[IV_EXT]]
; CHECK-NEXT: store i8 [[IV]], ptr [[ARRAYIDX1449]], align 1
; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], 1
; CHECK-NEXT: [[IV_NEXT_EXT:%.*]] = zext i8 [[IV_NEXT]] to i32
; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT_EXT]], 10000
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], {{!llvm.loop ![0-9]+}}
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -467,9 +410,8 @@ define void @test_widen_truncated_induction(ptr %A) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ <i8 0, i8 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i8> [[VEC_IND]], splat (i8 2)
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 2
-; CHECK-NEXT: store <2 x i8> [[VEC_IND]], ptr [[TMP2]], align 1
+; CHECK-NEXT: store <2 x i8> [[VEC_IND]], ptr [[TMP1]], align 1
; CHECK-NEXT: store <2 x i8> [[STEP_ADD]], ptr [[TMP3]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[STEP_ADD]], splat (i8 2)
@@ -490,8 +432,7 @@ define void @test_widen_truncated_induction(ptr %A) {
; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND3:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX2]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-NEXT: store <2 x i8> [[VEC_IND3]], ptr [[TMP8]], align 1
+; CHECK-NEXT: store <2 x i8> [[VEC_IND3]], ptr [[TMP7]], align 1
; CHECK-NEXT: [[INDEX_NEXT5]] = add nuw i64 [[INDEX2]], 2
; CHECK-NEXT: [[VEC_IND_NEXT4]] = add <2 x i8> [[VEC_IND3]], splat (i8 2)
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT5]], 10000
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
index 07c060a..19f2a36 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
@@ -33,8 +33,7 @@
; FORCED-LABEL: vector.body: ; preds = %vector.body, %vector.ph
; FORCED-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; FORCED-NEXT: [[GEP:%.+]] = getelementptr i64, ptr %dst, i32 %index
-; FORCED-NEXT: [[GEP2:%.+]] = getelementptr i64, ptr [[GEP]], i32 0
-; FORCED-NEXT: store <2 x i64> [[ADD]], ptr [[GEP2]], align 4
+; FORCED-NEXT: store <2 x i64> [[ADD]], ptr [[GEP]], align 4
; FORCED-NEXT: %index.next = add nuw i32 %index, 2
; FORCED-NEXT: [[C:%.+]] = icmp eq i32 %index.next, 1000
; FORCED-NEXT: br i1 [[C]], label %middle.block, label %vector.body
@@ -84,8 +83,7 @@ declare float @powf(float, float) readnone nounwind
; FORCED-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; FORCED-NEXT: [[GEP1:%.+]] = getelementptr float, ptr %dst, i32 %index
; FORCED-NEXT: [[POW:%.+]] = call <2 x float> @llvm.pow.v2f32(<2 x float> %broadcast.splat, <2 x float> %broadcast.splat2)
-; FORCED-NEXT: [[GEP2:%.+]] = getelementptr float, ptr [[GEP1]], i32 0
-; FORCED-NEXT: store <2 x float> [[POW]], ptr [[GEP2]], align 4
+; FORCED-NEXT: store <2 x float> [[POW]], ptr [[GEP1]], align 4
; FORCED-NEXT: %index.next = add nuw i32 %index, 2
; FORCED-NEXT: [[C:%.+]] = icmp eq i32 %index.next, 1000
; FORCED-NEXT: br i1 [[C]], label %middle.block, label %vector.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll b/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll
index 30e454d..fff99f1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll
@@ -61,8 +61,7 @@ define i32 @test_phi_iterator_invalidation(ptr %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP26:%.*]] = sext <4 x i16> [[TMP25]] to <4 x i32>
; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[TMP27]]
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[TMP28]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP26]], ptr [[TMP29]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP26]], ptr [[TMP28]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX_NEXT]], i64 1002)
; CHECK-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll
index a6c5692..c94b3a4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll
@@ -26,18 +26,16 @@ define void @firstorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i8> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_LOAD1:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 16
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[WIDE_LOAD1]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i8> [[VECTOR_RECUR]], <16 x i8> [[WIDE_LOAD]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[WIDE_LOAD]], <16 x i8> [[WIDE_LOAD1]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
; CHECK-NEXT: [[TMP9:%.*]] = add <16 x i8> [[WIDE_LOAD]], [[TMP7]]
; CHECK-NEXT: [[TMP10:%.*]] = add <16 x i8> [[WIDE_LOAD1]], [[TMP8]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16
-; CHECK-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP13]], align 1
+; CHECK-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP11]], align 1
; CHECK-NEXT: store <16 x i8> [[TMP10]], ptr [[TMP14]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -121,9 +119,8 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
; CHECK-NEXT: [[VECTOR_RECUR4:%.*]] = phi <16 x i8> [ [[VECTOR_RECUR_INIT3]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 16
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[WIDE_LOAD5]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i8> [[VECTOR_RECUR]], <16 x i8> [[WIDE_LOAD]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
; CHECK-NEXT: [[TMP8]] = shufflevector <16 x i8> [[WIDE_LOAD]], <16 x i8> [[WIDE_LOAD5]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
@@ -138,9 +135,8 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
; CHECK-NEXT: [[TMP17:%.*]] = add <16 x i8> [[TMP15]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP18:%.*]] = add <16 x i8> [[TMP16]], [[WIDE_LOAD5]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP19]], i32 0
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP19]], i32 16
-; CHECK-NEXT: store <16 x i8> [[TMP17]], ptr [[TMP21]], align 1
+; CHECK-NEXT: store <16 x i8> [[TMP17]], ptr [[TMP19]], align 1
; CHECK-NEXT: store <16 x i8> [[TMP18]], ptr [[TMP22]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
index 427a05c..5de9d0e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
@@ -53,9 +53,8 @@ define float @fmaxnum(ptr %src, i64 %n) {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 4
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC]], align 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll
index 1a8e594..ea44fc3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll
@@ -53,9 +53,8 @@ define float @fminnum(ptr %src, i64 %n) {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 4
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC]], align 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fminimumnum.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fminimumnum.ll
index 403a5f1..2ed2819 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fminimumnum.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fminimumnum.ll
@@ -20,29 +20,26 @@ define void @fmin32(ptr noundef readonly captures(none) %input1, ptr noundef rea
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT1]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP2]], i32 4
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT2]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP5]], i32 4
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD6]])
; CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[WIDE_LOAD5]], <4 x float> [[WIDE_LOAD7]])
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[OUTPUT]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i32 4
-; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[TMP11]], align 4
+; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[TMP10]], align 4
; CHECK-NEXT: store <4 x float> [[TMP9]], ptr [[TMP12]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -101,29 +98,26 @@ define void @fmax32(ptr noundef readonly captures(none) %input1, ptr noundef rea
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT1]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP2]], i32 4
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT2]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP5]], i32 4
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD6]])
; CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> [[WIDE_LOAD5]], <4 x float> [[WIDE_LOAD7]])
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[OUTPUT]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i32 4
-; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[TMP11]], align 4
+; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[TMP10]], align 4
; CHECK-NEXT: store <4 x float> [[TMP9]], ptr [[TMP12]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -182,29 +176,26 @@ define void @fmin64(ptr noundef readonly captures(none) %input1, ptr noundef rea
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT1]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 2
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x double>, ptr [[TMP4]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT2]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw double, ptr [[TMP5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw double, ptr [[TMP5]], i32 2
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x double>, ptr [[TMP6]], align 8
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD6]])
; CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> [[WIDE_LOAD5]], <2 x double> [[WIDE_LOAD7]])
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[OUTPUT]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw double, ptr [[TMP10]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw double, ptr [[TMP10]], i32 2
-; CHECK-NEXT: store <2 x double> [[TMP8]], ptr [[TMP11]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP8]], ptr [[TMP10]], align 8
; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[TMP12]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -263,29 +254,26 @@ define void @fmax64(ptr noundef readonly captures(none) %input1, ptr noundef rea
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT1]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 2
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x double>, ptr [[TMP4]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT2]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw double, ptr [[TMP5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw double, ptr [[TMP5]], i32 2
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x double>, ptr [[TMP6]], align 8
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD6]])
; CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> [[WIDE_LOAD5]], <2 x double> [[WIDE_LOAD7]])
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[OUTPUT]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw double, ptr [[TMP10]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw double, ptr [[TMP10]], i32 2
-; CHECK-NEXT: store <2 x double> [[TMP8]], ptr [[TMP11]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP8]], ptr [[TMP10]], align 8
; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[TMP12]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -344,29 +332,26 @@ define void @fmin16(ptr noundef readonly captures(none) %input1, ptr noundef rea
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT1]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw half, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw half, ptr [[TMP2]], i32 8
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP3]], align 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP2]], align 2
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x half>, ptr [[TMP6]], align 2
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT2]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw half, ptr [[TMP4]], i32 0
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw half, ptr [[TMP4]], i32 8
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x half>, ptr [[TMP5]], align 2
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x half>, ptr [[TMP4]], align 2
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x half>, ptr [[TMP10]], align 2
; CHECK-NEXT: [[TMP11:%.*]] = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> [[WIDE_LOAD]], <8 x half> [[WIDE_LOAD6]])
; CHECK-NEXT: [[TMP13:%.*]] = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> [[WIDE_LOAD5]], <8 x half> [[WIDE_LOAD7]])
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[OUTPUT]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw half, ptr [[TMP7]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw half, ptr [[TMP7]], i32 8
-; CHECK-NEXT: store <8 x half> [[TMP11]], ptr [[TMP8]], align 2
+; CHECK-NEXT: store <8 x half> [[TMP11]], ptr [[TMP7]], align 2
; CHECK-NEXT: store <8 x half> [[TMP13]], ptr [[TMP12]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -425,29 +410,26 @@ define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef rea
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT1]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw half, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw half, ptr [[TMP2]], i32 8
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP3]], align 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP2]], align 2
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x half>, ptr [[TMP6]], align 2
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT2]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw half, ptr [[TMP4]], i32 0
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw half, ptr [[TMP4]], i32 8
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x half>, ptr [[TMP5]], align 2
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x half>, ptr [[TMP4]], align 2
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x half>, ptr [[TMP10]], align 2
; CHECK-NEXT: [[TMP11:%.*]] = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> [[WIDE_LOAD]], <8 x half> [[WIDE_LOAD6]])
; CHECK-NEXT: [[TMP13:%.*]] = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> [[WIDE_LOAD5]], <8 x half> [[WIDE_LOAD7]])
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[OUTPUT]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw half, ptr [[TMP7]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw half, ptr [[TMP7]], i32 8
-; CHECK-NEXT: store <8 x half> [[TMP11]], ptr [[TMP8]], align 2
+; CHECK-NEXT: store <8 x half> [[TMP11]], ptr [[TMP7]], align 2
; CHECK-NEXT: store <8 x half> [[TMP13]], ptr [[TMP12]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
index 095ac22..c9cef14 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
@@ -19,11 +19,11 @@ define double @test_reduction_costs() {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi double [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_1:.*]]
; CHECK: [[LOOP_1]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_1]] ]
@@ -79,9 +79,8 @@ define void @test_iv_cost(ptr %ptr.start, i8 %a, i64 %b) {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 16
-; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP6]], align 1
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[NEXT_GEP1]], align 1
; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP3]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -103,16 +102,15 @@ define void @test_iv_cost(ptr %ptr.start, i8 %a, i64 %b) {
; CHECK-NEXT: [[IND_END5:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[N_VEC3]]
; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: store <4 x i8> zeroinitializer, ptr [[TMP2]], align 1
-; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC3]]
-; CHECK-NEXT: br i1 [[TMP7]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[INDEX4]]
+; CHECK-NEXT: store <4 x i8> zeroinitializer, ptr [[NEXT_GEP5]], align 1
+; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[CMP_N11:%.*]] = icmp eq i64 [[START]], [[N_VEC3]]
-; CHECK-NEXT: br i1 [[CMP_N11]], label %[[EXIT_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[START]], [[N_VEC3]]
+; CHECK-NEXT: br i1 [[CMP_N7]], label %[[EXIT_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END1]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi ptr [ [[IND_END5]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END2]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[PTR_START]], %[[ITER_CHECK]] ]
@@ -212,8 +210,7 @@ define void @test_exit_branch_cost(ptr %dst, ptr noalias %x.ptr, ptr noalias %y.
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE55:.*]] ]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[X_PTR]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
; CHECK-NEXT: [[TMP47:%.*]] = icmp eq <2 x i64> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP47]], splat (i1 true)
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
@@ -279,7 +276,7 @@ define void @test_exit_branch_cost(ptr %dst, ptr noalias %x.ptr, ptr noalias %y.
; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; CHECK-NEXT: br i1 [[TMP46]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
@@ -318,7 +315,7 @@ define void @test_exit_branch_cost(ptr %dst, ptr noalias %x.ptr, ptr noalias %y.
; CHECK: [[LOOP_LATCH]]:
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 64
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll
index 8471555..596a2ed 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll
@@ -87,8 +87,7 @@ define dso_local double @test(ptr nocapture noundef readonly %data, ptr nocaptur
; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x double> [ insertelement (<vscale x 2 x double> splat (double -0.000000e+00), double 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; SVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[OFFSET:%.*]], i64 [[INDEX]]
-; SVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP6]], align 4
+; SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP5]], align 4
; SVE-NEXT: [[TMP7:%.*]] = sext <vscale x 2 x i32> [[WIDE_LOAD]] to <vscale x 2 x i64>
; SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[DATA:%.*]], <vscale x 2 x i64> [[TMP7]]
; SVE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> [[TMP8]], i32 8, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> poison)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index dab1428..8b354d9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -37,11 +37,10 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; DEFAULT: [[VECTOR_BODY]]:
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; DEFAULT-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDEX]]
-; DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
; DEFAULT-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP24:%.*]] = mul nuw i64 [[TMP23]], 8
; DEFAULT-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP20]], i64 [[TMP24]]
-; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP22]], align 1
+; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP20]], align 1
; DEFAULT-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP25]], align 1
; DEFAULT-NEXT: [[TMP26:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i16>
; DEFAULT-NEXT: [[TMP27:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i16>
@@ -56,11 +55,10 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; DEFAULT-NEXT: [[TMP36:%.*]] = trunc <vscale x 8 x i16> [[TMP34]] to <vscale x 8 x i8>
; DEFAULT-NEXT: [[TMP37:%.*]] = trunc <vscale x 8 x i16> [[TMP35]] to <vscale x 8 x i8>
; DEFAULT-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
-; DEFAULT-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[TMP38]], i32 0
; DEFAULT-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP42:%.*]] = mul nuw i64 [[TMP41]], 8
; DEFAULT-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP38]], i64 [[TMP42]]
-; DEFAULT-NEXT: store <vscale x 8 x i8> [[TMP36]], ptr [[TMP40]], align 1
+; DEFAULT-NEXT: store <vscale x 8 x i8> [[TMP36]], ptr [[TMP38]], align 1
; DEFAULT-NEXT: store <vscale x 8 x i8> [[TMP37]], ptr [[TMP43]], align 1
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
; DEFAULT-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -125,8 +123,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
; PRED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDEX]]
-; PRED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP18]], i32 0
-; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP19]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
; PRED-NEXT: [[TMP17:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
; PRED-NEXT: [[TMP22:%.*]] = mul <vscale x 16 x i16> [[TMP17]], [[TMP16]]
; PRED-NEXT: [[TMP24:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
@@ -134,8 +131,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
; PRED-NEXT: [[TMP21:%.*]] = lshr <vscale x 16 x i16> [[TMP20]], splat (i16 1)
; PRED-NEXT: [[TMP23:%.*]] = trunc <vscale x 16 x i16> [[TMP21]] to <vscale x 16 x i8>
; PRED-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
-; PRED-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
-; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP23]], ptr [[TMP27]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP23]], ptr [[TMP26]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP15]])
; PRED-NEXT: [[TMP28:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
index 767bc35..aa2ec2de 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
@@ -25,9 +25,8 @@ define i32 @multi_exit_iv_uniform(i32 %a, i64 %N, ptr %dst) {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP5]], i32 4
-; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP8]], align 8
+; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP5]], align 8
; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP9]], align 8
; CHECK-NEXT: [[TMP10]] = add <4 x i32> [[VEC_PHI]], splat (i32 -1)
; CHECK-NEXT: [[TMP11]] = add <4 x i32> [[VEC_PHI1]], splat (i32 -1)
@@ -157,7 +156,6 @@ define i64 @int_and_pointer_iv(ptr %start, i32 %N) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 4000
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -170,13 +168,12 @@ define i64 @int_and_pointer_iv(ptr %start, i32 %N) {
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[START]], [[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -224,9 +221,8 @@ define void @wide_truncated_iv(ptr %dst) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, [[VECTOR_PH1]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP2]], i32 8
-; CHECK-NEXT: store <8 x i8> [[VEC_IND]], ptr [[TMP4]], align 1
+; CHECK-NEXT: store <8 x i8> [[VEC_IND]], ptr [[TMP2]], align 1
; CHECK-NEXT: store <8 x i8> [[STEP_ADD]], ptr [[TMP5]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i8> [[STEP_ADD]], splat (i8 8)
@@ -247,8 +243,7 @@ define void @wide_truncated_iv(ptr %dst) {
; CHECK-NEXT: [[INDEX3:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[VEC_IND4:%.*]] = phi <8 x i8> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX3]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT: store <8 x i8> [[VEC_IND4]], ptr [[TMP10]], align 1
+; CHECK-NEXT: store <8 x i8> [[VEC_IND4]], ptr [[TMP9]], align 1
; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX3]], 8
; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <8 x i8> [[VEC_IND4]], splat (i8 8)
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT7]], 200
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll
index 79d7ab8..f92aa06 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll
@@ -35,10 +35,10 @@ define void @pr58722_load_interleave_group(ptr %src, ptr %dst) {
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 1
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 1
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 1
-; CHECK-NEXT: [[TMP13:%.*]] = load i24, ptr [[TMP9]], align 4, !alias.scope !0
-; CHECK-NEXT: [[TMP14:%.*]] = load i24, ptr [[TMP10]], align 4, !alias.scope !0
-; CHECK-NEXT: [[TMP15:%.*]] = load i24, ptr [[TMP11]], align 4, !alias.scope !0
-; CHECK-NEXT: [[TMP16:%.*]] = load i24, ptr [[TMP12]], align 4, !alias.scope !0
+; CHECK-NEXT: [[TMP13:%.*]] = load i24, ptr [[TMP9]], align 4, !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT: [[TMP14:%.*]] = load i24, ptr [[TMP10]], align 4, !alias.scope [[META0]]
+; CHECK-NEXT: [[TMP15:%.*]] = load i24, ptr [[TMP11]], align 4, !alias.scope [[META0]]
+; CHECK-NEXT: [[TMP16:%.*]] = load i24, ptr [[TMP12]], align 4, !alias.scope [[META0]]
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i24> poison, i24 [[TMP13]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i24> [[TMP17]], i24 [[TMP14]], i32 1
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i24> [[TMP18]], i24 [[TMP15]], i32 2
@@ -46,8 +46,7 @@ define void @pr58722_load_interleave_group(ptr %src, ptr %dst) {
; CHECK-NEXT: [[TMP21:%.*]] = zext <4 x i24> [[TMP20]] to <4 x i32>
; CHECK-NEXT: [[TMP22:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP22]], ptr [[TMP24]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT: store <4 x i32> [[TMP22]], ptr [[TMP23]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
index 5b8acee..649be65 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
@@ -32,8 +32,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP2]], <16 x i32> [[TMP5]])
; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP3]], <16 x i32> [[TMP6]])
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP9]], align 1
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP8]], align 1
; CHECK-NEXT: [[TMP10:%.*]] = zext <16 x i32> [[TMP7]] to <16 x i64>
; CHECK-NEXT: [[TMP11]] = or <16 x i64> [[VEC_PHI]], [[TMP10]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -82,8 +81,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
; CHECK-NEXT: [[TMP30:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP23]], <vscale x 2 x i32> [[TMP29]])
; CHECK-NEXT: [[TMP31:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP30]])
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX7]]
-; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[TMP32]], i32 0
-; CHECK-NEXT: store <vscale x 2 x i8> zeroinitializer, ptr [[TMP33]], align 1
+; CHECK-NEXT: store <vscale x 2 x i8> zeroinitializer, ptr [[TMP32]], align 1
; CHECK-NEXT: [[TMP34:%.*]] = zext <vscale x 2 x i32> [[TMP31]] to <vscale x 2 x i64>
; CHECK-NEXT: [[TMP35]] = or <vscale x 2 x i64> [[VEC_PHI8]], [[TMP34]]
; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX7]], [[TMP21]]
@@ -172,8 +170,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP2]], <16 x i32> [[TMP5]])
; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP3]], <16 x i32> [[TMP6]])
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0
-; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP27]], align 1
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP26]], align 1
; CHECK-NEXT: [[TMP10:%.*]] = zext <16 x i32> [[TMP7]] to <16 x i64>
; CHECK-NEXT: [[TMP11]] = or <16 x i64> [[VEC_PHI]], [[TMP10]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -222,8 +219,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
; CHECK-NEXT: [[TMP30:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP23]], <vscale x 2 x i32> [[TMP29]])
; CHECK-NEXT: [[TMP31:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP30]])
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX7]]
-; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[TMP32]], i32 0
-; CHECK-NEXT: store <vscale x 2 x i8> zeroinitializer, ptr [[TMP33]], align 1
+; CHECK-NEXT: store <vscale x 2 x i8> zeroinitializer, ptr [[TMP32]], align 1
; CHECK-NEXT: [[TMP34:%.*]] = zext <vscale x 2 x i32> [[TMP31]] to <vscale x 2 x i64>
; CHECK-NEXT: [[TMP35]] = or <vscale x 2 x i64> [[VEC_PHI8]], [[TMP34]]
; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX7]], [[TMP21]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll
index 890e13c..9bd3d30 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll
@@ -35,11 +35,10 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8
; INTERLEAVE-4: vector.body:
; INTERLEAVE-4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; INTERLEAVE-4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
-; INTERLEAVE-4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
; INTERLEAVE-4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16
; INTERLEAVE-4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 32
; INTERLEAVE-4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 48
-; INTERLEAVE-4-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; INTERLEAVE-4-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; INTERLEAVE-4-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
; INTERLEAVE-4-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
; INTERLEAVE-4-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
@@ -56,11 +55,10 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8
; INTERLEAVE-4-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP15]], <16 x i8> [[BROADCAST_SPLAT]], <16 x i8> [[TMP19]]
; INTERLEAVE-4-NEXT: [[TMP24:%.*]] = select <16 x i1> [[TMP16]], <16 x i8> [[BROADCAST_SPLAT]], <16 x i8> [[TMP20]]
; INTERLEAVE-4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
-; INTERLEAVE-4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 0
; INTERLEAVE-4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 16
; INTERLEAVE-4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 32
; INTERLEAVE-4-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 48
-; INTERLEAVE-4-NEXT: store <16 x i8> [[TMP21]], ptr [[TMP29]], align 1
+; INTERLEAVE-4-NEXT: store <16 x i8> [[TMP21]], ptr [[TMP25]], align 1
; INTERLEAVE-4-NEXT: store <16 x i8> [[TMP22]], ptr [[TMP30]], align 1
; INTERLEAVE-4-NEXT: store <16 x i8> [[TMP23]], ptr [[TMP31]], align 1
; INTERLEAVE-4-NEXT: store <16 x i8> [[TMP24]], ptr [[TMP32]], align 1
@@ -86,14 +84,12 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8
; INTERLEAVE-4: vec.epilog.vector.body:
; INTERLEAVE-4-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT18:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; INTERLEAVE-4-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX12]]
-; INTERLEAVE-4-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i32 0
-; INTERLEAVE-4-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i8>, ptr [[TMP36]], align 1
+; INTERLEAVE-4-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i8>, ptr [[TMP35]], align 1
; INTERLEAVE-4-NEXT: [[TMP37:%.*]] = icmp sgt <8 x i8> [[WIDE_LOAD13]], [[BROADCAST_SPLAT15]]
; INTERLEAVE-4-NEXT: [[TMP38:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[WIDE_LOAD13]], <8 x i8> [[BROADCAST_SPLAT17]])
; INTERLEAVE-4-NEXT: [[TMP39:%.*]] = select <8 x i1> [[TMP37]], <8 x i8> [[BROADCAST_SPLAT15]], <8 x i8> [[TMP38]]
; INTERLEAVE-4-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX12]]
-; INTERLEAVE-4-NEXT: [[TMP41:%.*]] = getelementptr inbounds i8, ptr [[TMP40]], i32 0
-; INTERLEAVE-4-NEXT: store <8 x i8> [[TMP39]], ptr [[TMP41]], align 1
+; INTERLEAVE-4-NEXT: store <8 x i8> [[TMP39]], ptr [[TMP40]], align 1
; INTERLEAVE-4-NEXT: [[INDEX_NEXT18]] = add nuw i64 [[INDEX12]], 8
; INTERLEAVE-4-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT18]], [[N_VEC10]]
; INTERLEAVE-4-NEXT: br i1 [[TMP42]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -142,9 +138,8 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8
; INTERLEAVE-2: vector.body:
; INTERLEAVE-2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; INTERLEAVE-2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
-; INTERLEAVE-2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
; INTERLEAVE-2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 16
-; INTERLEAVE-2-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; INTERLEAVE-2-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; INTERLEAVE-2-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
; INTERLEAVE-2-NEXT: [[TMP7:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; INTERLEAVE-2-NEXT: [[TMP8:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD4]], [[BROADCAST_SPLAT]]
@@ -153,9 +148,8 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8
; INTERLEAVE-2-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP7]], <16 x i8> [[BROADCAST_SPLAT]], <16 x i8> [[TMP9]]
; INTERLEAVE-2-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP8]], <16 x i8> [[BROADCAST_SPLAT]], <16 x i8> [[TMP10]]
; INTERLEAVE-2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
-; INTERLEAVE-2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
; INTERLEAVE-2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 16
-; INTERLEAVE-2-NEXT: store <16 x i8> [[TMP11]], ptr [[TMP15]], align 1
+; INTERLEAVE-2-NEXT: store <16 x i8> [[TMP11]], ptr [[TMP13]], align 1
; INTERLEAVE-2-NEXT: store <16 x i8> [[TMP12]], ptr [[TMP16]], align 1
; INTERLEAVE-2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; INTERLEAVE-2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -179,14 +173,12 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8
; INTERLEAVE-2: vec.epilog.vector.body:
; INTERLEAVE-2-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; INTERLEAVE-2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX10]]
-; INTERLEAVE-2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP19]], i32 0
-; INTERLEAVE-2-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, ptr [[TMP20]], align 1
+; INTERLEAVE-2-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, ptr [[TMP19]], align 1
; INTERLEAVE-2-NEXT: [[TMP21:%.*]] = icmp sgt <8 x i8> [[WIDE_LOAD11]], [[BROADCAST_SPLAT13]]
; INTERLEAVE-2-NEXT: [[TMP22:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[WIDE_LOAD11]], <8 x i8> [[BROADCAST_SPLAT15]])
; INTERLEAVE-2-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP21]], <8 x i8> [[BROADCAST_SPLAT13]], <8 x i8> [[TMP22]]
; INTERLEAVE-2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX10]]
-; INTERLEAVE-2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP24]], i32 0
-; INTERLEAVE-2-NEXT: store <8 x i8> [[TMP23]], ptr [[TMP25]], align 1
+; INTERLEAVE-2-NEXT: store <8 x i8> [[TMP23]], ptr [[TMP24]], align 1
; INTERLEAVE-2-NEXT: [[INDEX_NEXT16]] = add nuw i64 [[INDEX10]], 8
; INTERLEAVE-2-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT16]], [[N_VEC8]]
; INTERLEAVE-2-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
index 449bcaa..f069347 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
@@ -29,11 +29,10 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) {
; INTERLEAVE-4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
; INTERLEAVE-4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
; INTERLEAVE-4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]]
-; INTERLEAVE-4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
; INTERLEAVE-4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4
; INTERLEAVE-4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8
; INTERLEAVE-4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12
-; INTERLEAVE-4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 1
+; INTERLEAVE-4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 1
; INTERLEAVE-4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP9]], align 1
; INTERLEAVE-4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP10]], align 1
; INTERLEAVE-4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP11]], align 1
@@ -66,8 +65,7 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) {
; INTERLEAVE-4-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; INTERLEAVE-4-NEXT: [[VEC_PHI13:%.*]] = phi <4 x i32> [ [[TMP18]], [[VEC_EPILOG_PH]] ], [ [[TMP22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; INTERLEAVE-4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX12]]
-; INTERLEAVE-4-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 0
-; INTERLEAVE-4-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, ptr [[TMP21]], align 1
+; INTERLEAVE-4-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, ptr [[TMP20]], align 1
; INTERLEAVE-4-NEXT: [[TMP22]] = add <4 x i32> [[VEC_PHI13]], [[WIDE_LOAD14]]
; INTERLEAVE-4-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 4
; INTERLEAVE-4-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC11]]
@@ -106,9 +104,8 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) {
; INTERLEAVE-2-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; INTERLEAVE-2-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; INTERLEAVE-2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]]
-; INTERLEAVE-2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
; INTERLEAVE-2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 4
-; INTERLEAVE-2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 1
+; INTERLEAVE-2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 1
; INTERLEAVE-2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP5]], align 1
; INTERLEAVE-2-NEXT: [[TMP6]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; INTERLEAVE-2-NEXT: [[TMP7]] = add <4 x i32> [[VEC_PHI1]], [[WIDE_LOAD2]]
@@ -138,10 +135,69 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) {
; INTERLEAVE-2-NEXT: ret i32 [[RED_NEXT_LCSSA]]
;
; INTERLEAVE-4-VLA-LABEL: @interleave_integer_reduction(
-; INTERLEAVE-4-VLA: add <vscale x 4 x i32>
-; INTERLEAVE-4-VLA-NEXT: add <vscale x 4 x i32>
-; INTERLEAVE-4-VLA-NEXT: add <vscale x 4 x i32>
-; INTERLEAVE-4-VLA-NEXT: add <vscale x 4 x i32>
+; INTERLEAVE-4-VLA-NEXT: entry:
+; INTERLEAVE-4-VLA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-4-VLA-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
+; INTERLEAVE-4-VLA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; INTERLEAVE-4-VLA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; INTERLEAVE-4-VLA: vector.ph:
+; INTERLEAVE-4-VLA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-4-VLA-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; INTERLEAVE-4-VLA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; INTERLEAVE-4-VLA-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-4-VLA-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
+; INTERLEAVE-4-VLA-NEXT: br label [[VECTOR_BODY:%.*]]
+; INTERLEAVE-4-VLA: vector.body:
+; INTERLEAVE-4-VLA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-4-VLA-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-4-VLA-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-4-VLA-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-4-VLA-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-4-VLA-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-4-VLA-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
+; INTERLEAVE-4-VLA-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 [[TMP8]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-4-VLA-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 8
+; INTERLEAVE-4-VLA-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 [[TMP11]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-4-VLA-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 12
+; INTERLEAVE-4-VLA-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 [[TMP14]]
+; INTERLEAVE-4-VLA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 1
+; INTERLEAVE-4-VLA-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i32>, ptr [[TMP9]], align 1
+; INTERLEAVE-4-VLA-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i32>, ptr [[TMP12]], align 1
+; INTERLEAVE-4-VLA-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 4 x i32>, ptr [[TMP15]], align 1
+; INTERLEAVE-4-VLA-NEXT: [[TMP16]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP17]] = add <vscale x 4 x i32> [[VEC_PHI1]], [[WIDE_LOAD4]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP18]] = add <vscale x 4 x i32> [[VEC_PHI2]], [[WIDE_LOAD5]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP19]] = add <vscale x 4 x i32> [[VEC_PHI3]], [[WIDE_LOAD6]]
+; INTERLEAVE-4-VLA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-4-VLA-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; INTERLEAVE-4-VLA: middle.block:
+; INTERLEAVE-4-VLA-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[TMP16]]
+; INTERLEAVE-4-VLA-NEXT: [[BIN_RDX7:%.*]] = add <vscale x 4 x i32> [[TMP18]], [[BIN_RDX]]
+; INTERLEAVE-4-VLA-NEXT: [[BIN_RDX8:%.*]] = add <vscale x 4 x i32> [[TMP19]], [[BIN_RDX7]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX8]])
+; INTERLEAVE-4-VLA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; INTERLEAVE-4-VLA-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; INTERLEAVE-4-VLA: scalar.ph:
+; INTERLEAVE-4-VLA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; INTERLEAVE-4-VLA-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; INTERLEAVE-4-VLA-NEXT: br label [[LOOP:%.*]]
+; INTERLEAVE-4-VLA: loop:
+; INTERLEAVE-4-VLA-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; INTERLEAVE-4-VLA-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
+; INTERLEAVE-4-VLA-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV]]
+; INTERLEAVE-4-VLA-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1
+; INTERLEAVE-4-VLA-NEXT: [[RED_NEXT]] = add i32 [[RED]], [[L]]
+; INTERLEAVE-4-VLA-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; INTERLEAVE-4-VLA-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; INTERLEAVE-4-VLA-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; INTERLEAVE-4-VLA: exit:
+; INTERLEAVE-4-VLA-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; INTERLEAVE-4-VLA-NEXT: ret i32 [[RED_NEXT_LCSSA]]
;
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll
index 26e96ca..d45dbcc 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll
@@ -50,9 +50,9 @@ define i32 @test_invariant_replicate_region(i32 %x, i1 %c) {
; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/licm-calls.ll b/llvm/test/Transforms/LoopVectorize/AArch64/licm-calls.ll
index e6d5c54..8c4eba6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/licm-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/licm-calls.ll
@@ -23,16 +23,15 @@ define void @licm_replicate_call(double %x, ptr %dst) {
; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP3]], [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP3]], [[TMP5]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, ptr [[TMP8]], i32 0
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, ptr [[TMP8]], i32 2
-; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP9]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP8]], align 8
; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[TMP10]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2)
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
@@ -46,7 +45,7 @@ define void @licm_replicate_call(double %x, ptr %dst) {
; CHECK-NEXT: store double [[MUL]], ptr [[GEP_DST]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 128
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
index 57d5b43..5066a9b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -24,15 +24,13 @@ define void @add_a(ptr noalias nocapture readonly %p, ptr noalias nocapture %q,
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i8> [[WIDE_LOAD]], splat (i8 2)
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP5]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = add <16 x i8> [[WIDE_LOAD]], splat (i8 2)
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDEX]]
+; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[TMP3]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -47,16 +45,14 @@ define void @add_a(ptr noalias nocapture readonly %p, ptr noalias nocapture %q,
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX4]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
-; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i8> [[WIDE_LOAD5]], splat (i8 2)
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDEX4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
-; CHECK-NEXT: store <4 x i8> [[TMP9]], ptr [[TMP11]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX4]]
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i8> [[WIDE_LOAD5]], splat (i8 2)
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDEX4]]
+; CHECK-NEXT: store <4 x i8> [[TMP6]], ptr [[TMP7]], align 1
; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], 4
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]]
; CHECK-NEXT: br i1 [[CMP_N7]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -70,8 +66,8 @@ define void @add_a(ptr noalias nocapture readonly %p, ptr noalias nocapture %q,
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP13]] to i32
+; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP9]] to i32
; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV]], 2
; CHECK-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i8
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDVARS_IV]]
@@ -125,15 +121,13 @@ define void @add_a1(ptr noalias nocapture readonly %p, ptr noalias nocapture %q,
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw <16 x i8> [[WIDE_LOAD]], splat (i8 2)
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP5]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw <16 x i8> [[WIDE_LOAD]], splat (i8 2)
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDEX]]
+; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[TMP3]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -148,16 +142,14 @@ define void @add_a1(ptr noalias nocapture readonly %p, ptr noalias nocapture %q,
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX4]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
-; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw <4 x i8> [[WIDE_LOAD5]], splat (i8 2)
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDEX4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
-; CHECK-NEXT: store <4 x i8> [[TMP9]], ptr [[TMP11]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX4]]
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw <4 x i8> [[WIDE_LOAD5]], splat (i8 2)
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDEX4]]
+; CHECK-NEXT: store <4 x i8> [[TMP6]], ptr [[TMP7]], align 1
; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], 4
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]]
; CHECK-NEXT: br i1 [[CMP_N7]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -171,8 +163,8 @@ define void @add_a1(ptr noalias nocapture readonly %p, ptr noalias nocapture %q,
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i8 [[TMP13]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i8 [[TMP9]], 2
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX3]], align 1
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
@@ -217,15 +209,13 @@ define void @add_b(ptr noalias nocapture readonly %p, ptr noalias nocapture %q,
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2
-; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i16> [[WIDE_LOAD]], splat (i16 2)
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <8 x i16> [[TMP3]], ptr [[TMP5]], align 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT: [[TMP2:%.*]] = add <8 x i16> [[WIDE_LOAD]], splat (i16 2)
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX]]
+; CHECK-NEXT: store <8 x i16> [[TMP2]], ptr [[TMP3]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -239,8 +229,8 @@ define void @add_b(ptr noalias nocapture readonly %p, ptr noalias nocapture %q,
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; CHECK-NEXT: [[CONV8:%.*]] = zext i16 [[TMP7]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT: [[CONV8:%.*]] = zext i16 [[TMP5]] to i32
; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV8]], 2
; CHECK-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDVARS_IV]]
@@ -292,16 +282,14 @@ define void @add_c(ptr noalias nocapture readonly %p, ptr noalias nocapture %q,
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i16>
-; CHECK-NEXT: [[TMP4:%.*]] = add <16 x i16> [[TMP3]], splat (i16 2)
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <16 x i16> [[TMP4]], ptr [[TMP6]], align 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i16>
+; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP2]], splat (i16 2)
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX]]
+; CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[TMP4]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -316,17 +304,15 @@ define void @add_c(ptr noalias nocapture readonly %p, ptr noalias nocapture %q,
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX4]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
-; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[WIDE_LOAD5]] to <4 x i16>
-; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i16> [[TMP10]], splat (i16 2)
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX4]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[TMP12]], i32 0
-; CHECK-NEXT: store <4 x i16> [[TMP11]], ptr [[TMP13]], align 2
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX4]]
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = zext <4 x i8> [[WIDE_LOAD5]] to <4 x i16>
+; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i16> [[TMP7]], splat (i16 2)
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX4]]
+; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr [[TMP9]], align 2
; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], 4
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
-; CHECK-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]]
; CHECK-NEXT: br i1 [[CMP_N7]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -340,8 +326,8 @@ define void @add_c(ptr noalias nocapture readonly %p, ptr noalias nocapture %q,
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP15]] to i32
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP11]] to i32
; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV]], 2
; CHECK-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDVARS_IV]]
@@ -390,16 +376,14 @@ define void @add_d(ptr noalias nocapture readonly %p, ptr noalias nocapture %q,
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2
-; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = add nsw <8 x i32> [[TMP3]], splat (i32 2)
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[Q]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <8 x i32> [[TMP4]], ptr [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = add nsw <8 x i32> [[TMP2]], splat (i32 2)
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[Q]], i64 [[INDEX]]
+; CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[TMP4]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -413,8 +397,8 @@ define void @add_d(ptr noalias nocapture readonly %p, ptr noalias nocapture %q,
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP8]] to i32
+; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP6]] to i32
; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 2
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[Q]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
@@ -470,22 +454,20 @@ define void @add_e(ptr noalias nocapture readonly %p, ptr noalias nocapture %q,
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = shl <16 x i8> [[WIDE_LOAD]], splat (i8 4)
-; CHECK-NEXT: [[TMP4:%.*]] = add <16 x i8> [[TMP3]], splat (i8 32)
-; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i8> [[WIDE_LOAD]], splat (i8 51)
-; CHECK-NEXT: [[TMP6:%.*]] = mul <16 x i8> [[TMP5]], splat (i8 60)
-; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i8> [[TMP4]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i8> [[TMP6]], splat (i8 -4)
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i8> [[TMP8]], [[BROADCAST_SPLAT3]]
-; CHECK-NEXT: [[TMP10:%.*]] = mul <16 x i8> [[TMP9]], [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: store <16 x i8> [[TMP10]], ptr [[TMP12]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = shl <16 x i8> [[WIDE_LOAD]], splat (i8 4)
+; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i8> [[TMP2]], splat (i8 32)
+; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i8> [[WIDE_LOAD]], splat (i8 51)
+; CHECK-NEXT: [[TMP5:%.*]] = mul <16 x i8> [[TMP4]], splat (i8 60)
+; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i8> [[TMP3]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i8> [[TMP5]], splat (i8 -4)
+; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i8> [[TMP7]], [[BROADCAST_SPLAT3]]
+; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP8]], [[TMP6]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDEX]]
+; CHECK-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP10]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -504,23 +486,21 @@ define void @add_e(ptr noalias nocapture readonly %p, ptr noalias nocapture %q,
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX10]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
-; CHECK-NEXT: [[TMP16:%.*]] = shl <4 x i8> [[WIDE_LOAD11]], splat (i8 4)
-; CHECK-NEXT: [[TMP17:%.*]] = add <4 x i8> [[TMP16]], splat (i8 32)
-; CHECK-NEXT: [[TMP18:%.*]] = or <4 x i8> [[WIDE_LOAD11]], splat (i8 51)
-; CHECK-NEXT: [[TMP19:%.*]] = mul <4 x i8> [[TMP18]], splat (i8 60)
-; CHECK-NEXT: [[TMP20:%.*]] = and <4 x i8> [[TMP17]], [[BROADCAST_SPLAT7]]
-; CHECK-NEXT: [[TMP21:%.*]] = and <4 x i8> [[TMP19]], splat (i8 -4)
-; CHECK-NEXT: [[TMP22:%.*]] = xor <4 x i8> [[TMP21]], [[BROADCAST_SPLAT9]]
-; CHECK-NEXT: [[TMP23:%.*]] = mul <4 x i8> [[TMP22]], [[TMP20]]
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDEX10]]
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP24]], i32 0
-; CHECK-NEXT: store <4 x i8> [[TMP23]], ptr [[TMP25]], align 1
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX10]]
+; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP12]], align 1
+; CHECK-NEXT: [[TMP13:%.*]] = shl <4 x i8> [[WIDE_LOAD11]], splat (i8 4)
+; CHECK-NEXT: [[TMP14:%.*]] = add <4 x i8> [[TMP13]], splat (i8 32)
+; CHECK-NEXT: [[TMP15:%.*]] = or <4 x i8> [[WIDE_LOAD11]], splat (i8 51)
+; CHECK-NEXT: [[TMP16:%.*]] = mul <4 x i8> [[TMP15]], splat (i8 60)
+; CHECK-NEXT: [[TMP17:%.*]] = and <4 x i8> [[TMP14]], [[BROADCAST_SPLAT7]]
+; CHECK-NEXT: [[TMP18:%.*]] = and <4 x i8> [[TMP16]], splat (i8 -4)
+; CHECK-NEXT: [[TMP19:%.*]] = xor <4 x i8> [[TMP18]], [[BROADCAST_SPLAT9]]
+; CHECK-NEXT: [[TMP20:%.*]] = mul <4 x i8> [[TMP19]], [[TMP17]]
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDEX10]]
+; CHECK-NEXT: store <4 x i8> [[TMP20]], ptr [[TMP21]], align 1
; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX10]], 4
-; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT12]], [[N_VEC5]]
-; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT12]], [[N_VEC5]]
+; CHECK-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[CMP_N13:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]]
; CHECK-NEXT: br i1 [[CMP_N13]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -534,8 +514,8 @@ define void @add_e(ptr noalias nocapture readonly %p, ptr noalias nocapture %q,
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP27:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP27]] to i32
+; CHECK-NEXT: [[TMP23:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP23]] to i32
; CHECK-NEXT: [[ADD:%.*]] = shl i32 [[CONV]], 4
; CHECK-NEXT: [[CONV2:%.*]] = add nuw nsw i32 [[ADD]], 32
; CHECK-NEXT: [[OR:%.*]] = or i32 [[CONV]], 51
@@ -612,24 +592,22 @@ define void @add_f(ptr noalias nocapture readonly %p, ptr noalias nocapture %q,
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i16>, ptr [[TMP2]], align 2
-; CHECK-NEXT: [[TMP3:%.*]] = trunc <16 x i16> [[WIDE_LOAD]] to <16 x i8>
-; CHECK-NEXT: [[TMP4:%.*]] = shl <16 x i8> [[TMP3]], splat (i8 4)
-; CHECK-NEXT: [[TMP5:%.*]] = add <16 x i8> [[TMP4]], splat (i8 32)
-; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i8> [[TMP3]], splat (i8 -52)
-; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i8> [[TMP6]], splat (i8 51)
-; CHECK-NEXT: [[TMP8:%.*]] = mul <16 x i8> [[TMP7]], splat (i8 60)
-; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i8> [[TMP5]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i8> [[TMP8]], splat (i8 -4)
-; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i8> [[TMP10]], [[BROADCAST_SPLAT3]]
-; CHECK-NEXT: [[TMP12:%.*]] = mul <16 x i8> [[TMP11]], [[TMP9]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
-; CHECK-NEXT: store <16 x i8> [[TMP12]], ptr [[TMP14]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT: [[TMP2:%.*]] = trunc <16 x i16> [[WIDE_LOAD]] to <16 x i8>
+; CHECK-NEXT: [[TMP3:%.*]] = shl <16 x i8> [[TMP2]], splat (i8 4)
+; CHECK-NEXT: [[TMP4:%.*]] = add <16 x i8> [[TMP3]], splat (i8 32)
+; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i8> [[TMP2]], splat (i8 -52)
+; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i8> [[TMP5]], splat (i8 51)
+; CHECK-NEXT: [[TMP7:%.*]] = mul <16 x i8> [[TMP6]], splat (i8 60)
+; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i8> [[TMP4]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i8> [[TMP7]], splat (i8 -4)
+; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i8> [[TMP9]], [[BROADCAST_SPLAT3]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i8> [[TMP10]], [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDEX]]
+; CHECK-NEXT: store <16 x i8> [[TMP11]], ptr [[TMP12]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -648,25 +626,23 @@ define void @add_f(ptr noalias nocapture readonly %p, ptr noalias nocapture %q,
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[INDEX10]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[TMP16]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i16>, ptr [[TMP17]], align 2
-; CHECK-NEXT: [[TMP18:%.*]] = trunc <4 x i16> [[WIDE_LOAD11]] to <4 x i8>
-; CHECK-NEXT: [[TMP19:%.*]] = shl <4 x i8> [[TMP18]], splat (i8 4)
-; CHECK-NEXT: [[TMP20:%.*]] = add <4 x i8> [[TMP19]], splat (i8 32)
-; CHECK-NEXT: [[TMP21:%.*]] = and <4 x i8> [[TMP18]], splat (i8 -52)
-; CHECK-NEXT: [[TMP22:%.*]] = or <4 x i8> [[TMP21]], splat (i8 51)
-; CHECK-NEXT: [[TMP23:%.*]] = mul <4 x i8> [[TMP22]], splat (i8 60)
-; CHECK-NEXT: [[TMP24:%.*]] = and <4 x i8> [[TMP20]], [[BROADCAST_SPLAT7]]
-; CHECK-NEXT: [[TMP25:%.*]] = and <4 x i8> [[TMP23]], splat (i8 -4)
-; CHECK-NEXT: [[TMP26:%.*]] = xor <4 x i8> [[TMP25]], [[BROADCAST_SPLAT9]]
-; CHECK-NEXT: [[TMP27:%.*]] = mul <4 x i8> [[TMP26]], [[TMP24]]
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDEX10]]
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP28]], i32 0
-; CHECK-NEXT: store <4 x i8> [[TMP27]], ptr [[TMP29]], align 1
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[INDEX10]]
+; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i16>, ptr [[TMP14]], align 2
+; CHECK-NEXT: [[TMP15:%.*]] = trunc <4 x i16> [[WIDE_LOAD11]] to <4 x i8>
+; CHECK-NEXT: [[TMP16:%.*]] = shl <4 x i8> [[TMP15]], splat (i8 4)
+; CHECK-NEXT: [[TMP17:%.*]] = add <4 x i8> [[TMP16]], splat (i8 32)
+; CHECK-NEXT: [[TMP18:%.*]] = and <4 x i8> [[TMP15]], splat (i8 -52)
+; CHECK-NEXT: [[TMP19:%.*]] = or <4 x i8> [[TMP18]], splat (i8 51)
+; CHECK-NEXT: [[TMP20:%.*]] = mul <4 x i8> [[TMP19]], splat (i8 60)
+; CHECK-NEXT: [[TMP21:%.*]] = and <4 x i8> [[TMP17]], [[BROADCAST_SPLAT7]]
+; CHECK-NEXT: [[TMP22:%.*]] = and <4 x i8> [[TMP20]], splat (i8 -4)
+; CHECK-NEXT: [[TMP23:%.*]] = xor <4 x i8> [[TMP22]], [[BROADCAST_SPLAT9]]
+; CHECK-NEXT: [[TMP24:%.*]] = mul <4 x i8> [[TMP23]], [[TMP21]]
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDEX10]]
+; CHECK-NEXT: store <4 x i8> [[TMP24]], ptr [[TMP25]], align 1
; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX10]], 4
-; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT12]], [[N_VEC5]]
-; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT12]], [[N_VEC5]]
+; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[CMP_N13:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]]
; CHECK-NEXT: br i1 [[CMP_N13]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -680,8 +656,8 @@ define void @add_f(ptr noalias nocapture readonly %p, ptr noalias nocapture %q,
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP31:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP31]] to i32
+; CHECK-NEXT: [[TMP27:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP27]] to i32
; CHECK-NEXT: [[ADD:%.*]] = shl i32 [[CONV]], 4
; CHECK-NEXT: [[CONV2:%.*]] = add nsw i32 [[ADD]], 32
; CHECK-NEXT: [[OR:%.*]] = and i32 [[CONV]], 204
@@ -751,19 +727,17 @@ define void @add_phifail(ptr noalias nocapture readonly %p, ptr noalias nocaptur
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw <16 x i32> [[TMP3]], splat (i32 2)
-; CHECK-NEXT: [[TMP5:%.*]] = trunc <16 x i32> [[TMP4]] to <16 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <16 x i8> [[TMP5]], ptr [[TMP7]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw <16 x i32> [[TMP2]], splat (i32 2)
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <16 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDEX]]
+; CHECK-NEXT: store <16 x i8> [[TMP4]], ptr [[TMP5]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP3]], i32 15
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP2]], i32 15
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -778,8 +752,8 @@ define void @add_phifail(ptr noalias nocapture readonly %p, ptr noalias nocaptur
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[A_PHI:%.*]] = phi i32 [ [[CONV:%.*]], [[FOR_BODY]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[CONV]] = zext i8 [[TMP9]] to i32
+; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CONV]] = zext i8 [[TMP7]] to i32
; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV]], 2
; CHECK-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i8
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDVARS_IV]]
@@ -832,20 +806,18 @@ define i8 @add_phifail2(ptr noalias nocapture readonly %p, ptr noalias nocapture
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
-; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw <16 x i32> [[TMP5]], splat (i32 2)
-; CHECK-NEXT: [[TMP7:%.*]] = trunc <16 x i32> [[TMP6]] to <16 x i8>
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <16 x i8> [[TMP7]], ptr [[TMP9]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw <16 x i32> [[TMP4]], splat (i32 2)
+; CHECK-NEXT: [[TMP6:%.*]] = trunc <16 x i32> [[TMP5]] to <16 x i8>
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDEX]]
+; CHECK-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP7]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP5]], i32 15
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <16 x i32> [[TMP5]], i32 14
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP4]], i32 15
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <16 x i32> [[TMP4]], i32 14
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -860,8 +832,8 @@ define i8 @add_phifail2(ptr noalias nocapture readonly %p, ptr noalias nocapture
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[A_PHI]] = phi i32 [ [[CONV:%.*]], [[FOR_BODY]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[CONV]] = zext i8 [[TMP11]] to i32
+; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CONV]] = zext i8 [[TMP9]] to i32
; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV]], 2
; CHECK-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i8
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDVARS_IV]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
index 6499a1f..5f5d326 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
@@ -81,10 +81,9 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
; CHECK-VS1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VS1-NEXT: [[TMP20:%.*]] = add i64 [[TMP0]], [[INDEX]]
; CHECK-VS1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[TMP20]]
-; CHECK-VS1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP22]], i32 0
-; CHECK-VS1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP23]], align 1
+; CHECK-VS1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP22]], align 1
; CHECK-VS1-NEXT: [[TMP24:%.*]] = add <vscale x 16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-VS1-NEXT: store <vscale x 16 x i8> [[TMP24]], ptr [[TMP23]], align 1
+; CHECK-VS1-NEXT: store <vscale x 16 x i8> [[TMP24]], ptr [[TMP22]], align 1
; CHECK-VS1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
; CHECK-VS1-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VS1-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -114,10 +113,9 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
; CHECK-VS1-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-VS1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX5]]
; CHECK-VS1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[OFFSET_IDX]]
-; CHECK-VS1-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP33]], i32 0
-; CHECK-VS1-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x i8>, ptr [[TMP34]], align 1
+; CHECK-VS1-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x i8>, ptr [[TMP33]], align 1
; CHECK-VS1-NEXT: [[TMP35:%.*]] = add <vscale x 8 x i8> [[WIDE_LOAD6]], [[BROADCAST_SPLAT8]]
-; CHECK-VS1-NEXT: store <vscale x 8 x i8> [[TMP35]], ptr [[TMP34]], align 1
+; CHECK-VS1-NEXT: store <vscale x 8 x i8> [[TMP35]], ptr [[TMP33]], align 1
; CHECK-VS1-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX5]], [[TMP31]]
; CHECK-VS1-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC3]]
; CHECK-VS1-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -187,10 +185,9 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
; CHECK-VS2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VS2-NEXT: [[TMP20:%.*]] = add i64 [[TMP0]], [[INDEX]]
; CHECK-VS2-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[TMP20]]
-; CHECK-VS2-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP22]], i32 0
-; CHECK-VS2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP23]], align 1
+; CHECK-VS2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP22]], align 1
; CHECK-VS2-NEXT: [[TMP24:%.*]] = add <vscale x 8 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-VS2-NEXT: store <vscale x 8 x i8> [[TMP24]], ptr [[TMP23]], align 1
+; CHECK-VS2-NEXT: store <vscale x 8 x i8> [[TMP24]], ptr [[TMP22]], align 1
; CHECK-VS2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
; CHECK-VS2-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VS2-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -220,10 +217,9 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
; CHECK-VS2-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-VS2-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX5]]
; CHECK-VS2-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[OFFSET_IDX]]
-; CHECK-VS2-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP33]], i32 0
-; CHECK-VS2-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 4 x i8>, ptr [[TMP34]], align 1
+; CHECK-VS2-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 4 x i8>, ptr [[TMP33]], align 1
; CHECK-VS2-NEXT: [[TMP35:%.*]] = add <vscale x 4 x i8> [[WIDE_LOAD6]], [[BROADCAST_SPLAT8]]
-; CHECK-VS2-NEXT: store <vscale x 4 x i8> [[TMP35]], ptr [[TMP34]], align 1
+; CHECK-VS2-NEXT: store <vscale x 4 x i8> [[TMP35]], ptr [[TMP33]], align 1
; CHECK-VS2-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX5]], [[TMP31]]
; CHECK-VS2-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC3]]
; CHECK-VS2-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -435,10 +431,9 @@ define void @overflow_indvar_known_false(ptr nocapture noundef %p, i32 noundef %
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP13]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP14]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP13]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
; CHECK-NEXT: [[TMP15:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP15]], ptr [[TMP14]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP15]], ptr [[TMP13]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[TMP1]])
; CHECK-NEXT: [[TMP16:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
@@ -501,17 +496,16 @@ define i32 @tc4(ptr noundef readonly captures(none) %tmp) vscale_range(1,16) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX1]], align 4
; CHECK-NEXT: [[TMP3]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll b/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll
index a8d4442..1159a64 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll
@@ -61,10 +61,10 @@ define i32 @add_reduction_select_operand_constant_but_non_uniform() {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ 42, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 42, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[ADD2_REASS:%.*]], %[[LOOP]] ]
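
The other change threaded through these tests: when the middle block's exit condition is the constant true (the vector loop alone covers the whole trip count), the conditional branch collapses to an unconditional branch to the exit, and the scalar preheader phi loses its middle-block incoming value because that edge no longer exists. A minimal before-state sketch (hypothetical names and constants; constant-folding the branch yields exactly the simplified phi form the updated checks expect):

define i64 @fold_true_branch(i1 %c) {
entry:
  br i1 %c, label %vector.ph, label %scalar.ph
vector.ph:
  br label %middle.block
middle.block:
  ; 'br i1 true' always takes %exit; once folded, %scalar.ph is
  ; reached only from %entry, so its phi keeps a single incoming.
  br i1 true, label %exit, label %scalar.ph
scalar.ph:
  %resume = phi i64 [ 4, %middle.block ], [ 0, %entry ]
  br label %exit
exit:
  %r = phi i64 [ 42, %middle.block ], [ %resume, %scalar.ph ]
  ret i64 %r
}
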
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll
index f0835b8..1471896 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll
@@ -24,16 +24,14 @@ define void @always_vectorize(ptr %p, i32 %x) {
; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
; DEFAULT: [[VECTOR_BODY]]:
-; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 0
-; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[P]], align 4
; DEFAULT-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 0
-; DEFAULT-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP5]], align 4
+; DEFAULT-NEXT: store <4 x i32> [[TMP3]], ptr [[P]], align 4
; DEFAULT-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; DEFAULT: [[MIDDLE_BLOCK]]:
-; DEFAULT-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
; DEFAULT: [[SCALAR_PH]]:
-; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; DEFAULT-NEXT: br label %[[FOR_BODY:.*]]
; DEFAULT: [[FOR_BODY]]:
; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -56,16 +54,14 @@ define void @always_vectorize(ptr %p, i32 %x) {
; OPTSIZE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; OPTSIZE-NEXT: br label %[[VECTOR_BODY:.*]]
; OPTSIZE: [[VECTOR_BODY]]:
-; OPTSIZE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 0
-; OPTSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; OPTSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[P]], align 4
; OPTSIZE-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; OPTSIZE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 0
-; OPTSIZE-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP5]], align 4
+; OPTSIZE-NEXT: store <4 x i32> [[TMP3]], ptr [[P]], align 4
; OPTSIZE-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; OPTSIZE: [[MIDDLE_BLOCK]]:
-; OPTSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
; OPTSIZE: [[SCALAR_PH]]:
-; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]]
; OPTSIZE: [[FOR_BODY]]:
; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -88,16 +84,14 @@ define void @always_vectorize(ptr %p, i32 %x) {
; MINSIZE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; MINSIZE-NEXT: br label %[[VECTOR_BODY:.*]]
; MINSIZE: [[VECTOR_BODY]]:
-; MINSIZE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 0
-; MINSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; MINSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[P]], align 4
; MINSIZE-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; MINSIZE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 0
-; MINSIZE-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
+; MINSIZE-NEXT: store <4 x i32> [[TMP2]], ptr [[P]], align 4
; MINSIZE-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; MINSIZE: [[MIDDLE_BLOCK]]:
-; MINSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; MINSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
; MINSIZE: [[SCALAR_PH]]:
-; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; MINSIZE-NEXT: br label %[[FOR_BODY:.*]]
; MINSIZE: [[FOR_BODY]]:
; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -145,13 +139,12 @@ define void @vectorize_without_optsize(ptr %p, i32 %x, i64 %n) {
; DEFAULT: [[VECTOR_BODY]]:
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDEX]]
-; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 4
-; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; DEFAULT-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
; DEFAULT-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; DEFAULT-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]]
-; DEFAULT-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP2]], align 4
+; DEFAULT-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP1]], align 4
; DEFAULT-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP3]], align 4
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; DEFAULT-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -239,9 +232,9 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n)
; DEFAULT-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT5]], <16 x i8> poison, <16 x i32> zeroinitializer
; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
; DEFAULT: [[VECTOR_BODY]]:
-; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE36:.*]] ]
-; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <16 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE36]] ]
-; DEFAULT-NEXT: [[VEC_IND1:%.*]] = phi <16 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[PRED_STORE_CONTINUE36]] ]
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE35:.*]] ]
+; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <16 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE35]] ]
+; DEFAULT-NEXT: [[VEC_IND1:%.*]] = phi <16 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[PRED_STORE_CONTINUE35]] ]
; DEFAULT-NEXT: [[TMP0:%.*]] = icmp ule <16 x i8> [[VEC_IND]], splat (i8 14)
; DEFAULT-NEXT: [[TMP1:%.*]] = mul <16 x i8> [[BROADCAST_SPLAT]], [[VEC_IND1]]
; DEFAULT-NEXT: [[TMP2:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 1)
@@ -260,140 +253,140 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n)
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE]]
; DEFAULT: [[PRED_STORE_CONTINUE]]:
; DEFAULT-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP0]], i32 1
-; DEFAULT-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
-; DEFAULT: [[PRED_STORE_IF7]]:
+; DEFAULT-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]]
+; DEFAULT: [[PRED_STORE_IF6]]:
; DEFAULT-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 1
; DEFAULT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP13]]
; DEFAULT-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[TMP7]], i32 1
; DEFAULT-NEXT: store i8 [[TMP15]], ptr [[TMP14]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE8]]
-; DEFAULT: [[PRED_STORE_CONTINUE8]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE7]]
+; DEFAULT: [[PRED_STORE_CONTINUE7]]:
; DEFAULT-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP0]], i32 2
-; DEFAULT-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
-; DEFAULT: [[PRED_STORE_IF9]]:
+; DEFAULT-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]]
+; DEFAULT: [[PRED_STORE_IF8]]:
; DEFAULT-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 2
; DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP17]]
; DEFAULT-NEXT: [[TMP19:%.*]] = extractelement <16 x i8> [[TMP7]], i32 2
; DEFAULT-NEXT: store i8 [[TMP19]], ptr [[TMP18]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE10]]
-; DEFAULT: [[PRED_STORE_CONTINUE10]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE9]]
+; DEFAULT: [[PRED_STORE_CONTINUE9]]:
; DEFAULT-NEXT: [[TMP20:%.*]] = extractelement <16 x i1> [[TMP0]], i32 3
-; DEFAULT-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
-; DEFAULT: [[PRED_STORE_IF11]]:
+; DEFAULT-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11:.*]]
+; DEFAULT: [[PRED_STORE_IF10]]:
; DEFAULT-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 3
; DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP21]]
; DEFAULT-NEXT: [[TMP23:%.*]] = extractelement <16 x i8> [[TMP7]], i32 3
; DEFAULT-NEXT: store i8 [[TMP23]], ptr [[TMP22]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE12]]
-; DEFAULT: [[PRED_STORE_CONTINUE12]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE11]]
+; DEFAULT: [[PRED_STORE_CONTINUE11]]:
; DEFAULT-NEXT: [[TMP24:%.*]] = extractelement <16 x i1> [[TMP0]], i32 4
-; DEFAULT-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
-; DEFAULT: [[PRED_STORE_IF13]]:
+; DEFAULT-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13:.*]]
+; DEFAULT: [[PRED_STORE_IF12]]:
; DEFAULT-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 4
; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP25]]
; DEFAULT-NEXT: [[TMP27:%.*]] = extractelement <16 x i8> [[TMP7]], i32 4
; DEFAULT-NEXT: store i8 [[TMP27]], ptr [[TMP26]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE14]]
-; DEFAULT: [[PRED_STORE_CONTINUE14]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE13]]
+; DEFAULT: [[PRED_STORE_CONTINUE13]]:
; DEFAULT-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP0]], i32 5
-; DEFAULT-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
-; DEFAULT: [[PRED_STORE_IF15]]:
+; DEFAULT-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15:.*]]
+; DEFAULT: [[PRED_STORE_IF14]]:
; DEFAULT-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 5
; DEFAULT-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP29]]
; DEFAULT-NEXT: [[TMP31:%.*]] = extractelement <16 x i8> [[TMP7]], i32 5
; DEFAULT-NEXT: store i8 [[TMP31]], ptr [[TMP30]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE16]]
-; DEFAULT: [[PRED_STORE_CONTINUE16]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE15]]
+; DEFAULT: [[PRED_STORE_CONTINUE15]]:
; DEFAULT-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP0]], i32 6
-; DEFAULT-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
-; DEFAULT: [[PRED_STORE_IF17]]:
+; DEFAULT-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF16:.*]], label %[[PRED_STORE_CONTINUE17:.*]]
+; DEFAULT: [[PRED_STORE_IF16]]:
; DEFAULT-NEXT: [[TMP33:%.*]] = add i64 [[INDEX]], 6
; DEFAULT-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP33]]
; DEFAULT-NEXT: [[TMP35:%.*]] = extractelement <16 x i8> [[TMP7]], i32 6
; DEFAULT-NEXT: store i8 [[TMP35]], ptr [[TMP34]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE18]]
-; DEFAULT: [[PRED_STORE_CONTINUE18]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE17]]
+; DEFAULT: [[PRED_STORE_CONTINUE17]]:
; DEFAULT-NEXT: [[TMP36:%.*]] = extractelement <16 x i1> [[TMP0]], i32 7
-; DEFAULT-NEXT: br i1 [[TMP36]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
-; DEFAULT: [[PRED_STORE_IF19]]:
+; DEFAULT-NEXT: br i1 [[TMP36]], label %[[PRED_STORE_IF18:.*]], label %[[PRED_STORE_CONTINUE19:.*]]
+; DEFAULT: [[PRED_STORE_IF18]]:
; DEFAULT-NEXT: [[TMP37:%.*]] = add i64 [[INDEX]], 7
; DEFAULT-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP37]]
; DEFAULT-NEXT: [[TMP39:%.*]] = extractelement <16 x i8> [[TMP7]], i32 7
; DEFAULT-NEXT: store i8 [[TMP39]], ptr [[TMP38]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE20]]
-; DEFAULT: [[PRED_STORE_CONTINUE20]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE19]]
+; DEFAULT: [[PRED_STORE_CONTINUE19]]:
; DEFAULT-NEXT: [[TMP40:%.*]] = extractelement <16 x i1> [[TMP0]], i32 8
-; DEFAULT-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
-; DEFAULT: [[PRED_STORE_IF21]]:
+; DEFAULT-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF20:.*]], label %[[PRED_STORE_CONTINUE21:.*]]
+; DEFAULT: [[PRED_STORE_IF20]]:
; DEFAULT-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], 8
; DEFAULT-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP41]]
; DEFAULT-NEXT: [[TMP43:%.*]] = extractelement <16 x i8> [[TMP7]], i32 8
; DEFAULT-NEXT: store i8 [[TMP43]], ptr [[TMP42]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE22]]
-; DEFAULT: [[PRED_STORE_CONTINUE22]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE21]]
+; DEFAULT: [[PRED_STORE_CONTINUE21]]:
; DEFAULT-NEXT: [[TMP44:%.*]] = extractelement <16 x i1> [[TMP0]], i32 9
-; DEFAULT-NEXT: br i1 [[TMP44]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
-; DEFAULT: [[PRED_STORE_IF23]]:
+; DEFAULT-NEXT: br i1 [[TMP44]], label %[[PRED_STORE_IF22:.*]], label %[[PRED_STORE_CONTINUE23:.*]]
+; DEFAULT: [[PRED_STORE_IF22]]:
; DEFAULT-NEXT: [[TMP45:%.*]] = add i64 [[INDEX]], 9
; DEFAULT-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP45]]
; DEFAULT-NEXT: [[TMP47:%.*]] = extractelement <16 x i8> [[TMP7]], i32 9
; DEFAULT-NEXT: store i8 [[TMP47]], ptr [[TMP46]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE24]]
-; DEFAULT: [[PRED_STORE_CONTINUE24]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE23]]
+; DEFAULT: [[PRED_STORE_CONTINUE23]]:
; DEFAULT-NEXT: [[TMP48:%.*]] = extractelement <16 x i1> [[TMP0]], i32 10
-; DEFAULT-NEXT: br i1 [[TMP48]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
-; DEFAULT: [[PRED_STORE_IF25]]:
+; DEFAULT-NEXT: br i1 [[TMP48]], label %[[PRED_STORE_IF24:.*]], label %[[PRED_STORE_CONTINUE25:.*]]
+; DEFAULT: [[PRED_STORE_IF24]]:
; DEFAULT-NEXT: [[TMP49:%.*]] = add i64 [[INDEX]], 10
; DEFAULT-NEXT: [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP49]]
; DEFAULT-NEXT: [[TMP51:%.*]] = extractelement <16 x i8> [[TMP7]], i32 10
; DEFAULT-NEXT: store i8 [[TMP51]], ptr [[TMP50]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE26]]
-; DEFAULT: [[PRED_STORE_CONTINUE26]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE25]]
+; DEFAULT: [[PRED_STORE_CONTINUE25]]:
; DEFAULT-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP0]], i32 11
-; DEFAULT-NEXT: br i1 [[TMP52]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
-; DEFAULT: [[PRED_STORE_IF27]]:
+; DEFAULT-NEXT: br i1 [[TMP52]], label %[[PRED_STORE_IF26:.*]], label %[[PRED_STORE_CONTINUE27:.*]]
+; DEFAULT: [[PRED_STORE_IF26]]:
; DEFAULT-NEXT: [[TMP53:%.*]] = add i64 [[INDEX]], 11
; DEFAULT-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP53]]
; DEFAULT-NEXT: [[TMP55:%.*]] = extractelement <16 x i8> [[TMP7]], i32 11
; DEFAULT-NEXT: store i8 [[TMP55]], ptr [[TMP54]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE28]]
-; DEFAULT: [[PRED_STORE_CONTINUE28]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE27]]
+; DEFAULT: [[PRED_STORE_CONTINUE27]]:
; DEFAULT-NEXT: [[TMP56:%.*]] = extractelement <16 x i1> [[TMP0]], i32 12
-; DEFAULT-NEXT: br i1 [[TMP56]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
-; DEFAULT: [[PRED_STORE_IF29]]:
+; DEFAULT-NEXT: br i1 [[TMP56]], label %[[PRED_STORE_IF28:.*]], label %[[PRED_STORE_CONTINUE29:.*]]
+; DEFAULT: [[PRED_STORE_IF28]]:
; DEFAULT-NEXT: [[TMP57:%.*]] = add i64 [[INDEX]], 12
; DEFAULT-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP57]]
; DEFAULT-NEXT: [[TMP59:%.*]] = extractelement <16 x i8> [[TMP7]], i32 12
; DEFAULT-NEXT: store i8 [[TMP59]], ptr [[TMP58]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE30]]
-; DEFAULT: [[PRED_STORE_CONTINUE30]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE29]]
+; DEFAULT: [[PRED_STORE_CONTINUE29]]:
; DEFAULT-NEXT: [[TMP60:%.*]] = extractelement <16 x i1> [[TMP0]], i32 13
-; DEFAULT-NEXT: br i1 [[TMP60]], label %[[PRED_STORE_IF31:.*]], label %[[PRED_STORE_CONTINUE32:.*]]
-; DEFAULT: [[PRED_STORE_IF31]]:
+; DEFAULT-NEXT: br i1 [[TMP60]], label %[[PRED_STORE_IF30:.*]], label %[[PRED_STORE_CONTINUE31:.*]]
+; DEFAULT: [[PRED_STORE_IF30]]:
; DEFAULT-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], 13
; DEFAULT-NEXT: [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP61]]
; DEFAULT-NEXT: [[TMP63:%.*]] = extractelement <16 x i8> [[TMP7]], i32 13
; DEFAULT-NEXT: store i8 [[TMP63]], ptr [[TMP62]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE32]]
-; DEFAULT: [[PRED_STORE_CONTINUE32]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE31]]
+; DEFAULT: [[PRED_STORE_CONTINUE31]]:
; DEFAULT-NEXT: [[TMP64:%.*]] = extractelement <16 x i1> [[TMP0]], i32 14
-; DEFAULT-NEXT: br i1 [[TMP64]], label %[[PRED_STORE_IF33:.*]], label %[[PRED_STORE_CONTINUE34:.*]]
-; DEFAULT: [[PRED_STORE_IF33]]:
+; DEFAULT-NEXT: br i1 [[TMP64]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33:.*]]
+; DEFAULT: [[PRED_STORE_IF32]]:
; DEFAULT-NEXT: [[TMP65:%.*]] = add i64 [[INDEX]], 14
; DEFAULT-NEXT: [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP65]]
; DEFAULT-NEXT: [[TMP67:%.*]] = extractelement <16 x i8> [[TMP7]], i32 14
; DEFAULT-NEXT: store i8 [[TMP67]], ptr [[TMP66]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE34]]
-; DEFAULT: [[PRED_STORE_CONTINUE34]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE33]]
+; DEFAULT: [[PRED_STORE_CONTINUE33]]:
; DEFAULT-NEXT: [[TMP68:%.*]] = extractelement <16 x i1> [[TMP0]], i32 15
-; DEFAULT-NEXT: br i1 [[TMP68]], label %[[PRED_STORE_IF35:.*]], label %[[PRED_STORE_CONTINUE36]]
-; DEFAULT: [[PRED_STORE_IF35]]:
+; DEFAULT-NEXT: br i1 [[TMP68]], label %[[PRED_STORE_IF34:.*]], label %[[PRED_STORE_CONTINUE35]]
+; DEFAULT: [[PRED_STORE_IF34]]:
; DEFAULT-NEXT: [[TMP69:%.*]] = add i64 [[INDEX]], 15
; DEFAULT-NEXT: [[TMP70:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP69]]
; DEFAULT-NEXT: [[TMP71:%.*]] = extractelement <16 x i8> [[TMP7]], i32 15
; DEFAULT-NEXT: store i8 [[TMP71]], ptr [[TMP70]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE36]]
-; DEFAULT: [[PRED_STORE_CONTINUE36]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE35]]
+; DEFAULT: [[PRED_STORE_CONTINUE35]]:
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 16)
; DEFAULT-NEXT: [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16)
@@ -540,8 +533,7 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32
; DEFAULT-NEXT: [[TMP20:%.*]] = mul <vscale x 16 x i8> [[TMP19]], [[BROADCAST_SPLAT4]]
; DEFAULT-NEXT: [[TMP21:%.*]] = add <vscale x 16 x i8> [[TMP18]], [[TMP20]]
; DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]]
-; DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i32 0
-; DEFAULT-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP21]], ptr [[TMP23]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; DEFAULT-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP21]], ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
; DEFAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; DEFAULT-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; DEFAULT-NEXT: [[TMP24:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
@@ -616,8 +608,7 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32
; OPTSIZE-NEXT: [[TMP20:%.*]] = mul <vscale x 16 x i8> [[TMP19]], [[BROADCAST_SPLAT4]]
; OPTSIZE-NEXT: [[TMP21:%.*]] = add <vscale x 16 x i8> [[TMP18]], [[TMP20]]
; OPTSIZE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]]
-; OPTSIZE-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i32 0
-; OPTSIZE-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP21]], ptr [[TMP23]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; OPTSIZE-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP21]], ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
; OPTSIZE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; OPTSIZE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; OPTSIZE-NEXT: [[TMP24:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
@@ -692,8 +683,7 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32
; MINSIZE-NEXT: [[TMP20:%.*]] = mul <vscale x 16 x i8> [[TMP19]], [[BROADCAST_SPLAT4]]
; MINSIZE-NEXT: [[TMP21:%.*]] = add <vscale x 16 x i8> [[TMP18]], [[TMP20]]
; MINSIZE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]]
-; MINSIZE-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP22]], i32 0
-; MINSIZE-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP21]], ptr [[TMP23]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; MINSIZE-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP21]], ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
; MINSIZE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; MINSIZE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; MINSIZE-NEXT: [[TMP24:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
@@ -760,35 +750,32 @@ define void @dont_vectorize_with_minsize() {
; DEFAULT: [[VECTOR_BODY]]:
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDEX]]
-; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0
; DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 8
-; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4
+; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP1]], align 4
; DEFAULT-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDEX]]
-; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP4]], i32 0
; DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP4]], i32 8
-; DEFAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
+; DEFAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
; DEFAULT-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4
; DEFAULT-NEXT: [[TMP7:%.*]] = mul nsw <8 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; DEFAULT-NEXT: [[TMP8:%.*]] = mul nsw <8 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
; DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDEX]]
-; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP9]], i32 0
; DEFAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP9]], i32 8
-; DEFAULT-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i16>, ptr [[TMP10]], align 2
+; DEFAULT-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i16>, ptr [[TMP9]], align 2
; DEFAULT-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i16>, ptr [[TMP11]], align 2
; DEFAULT-NEXT: [[TMP12:%.*]] = trunc <8 x i32> [[TMP7]] to <8 x i16>
; DEFAULT-NEXT: [[TMP13:%.*]] = trunc <8 x i32> [[TMP8]] to <8 x i16>
; DEFAULT-NEXT: [[TMP14:%.*]] = add <8 x i16> [[TMP12]], [[WIDE_LOAD4]]
; DEFAULT-NEXT: [[TMP15:%.*]] = add <8 x i16> [[TMP13]], [[WIDE_LOAD5]]
-; DEFAULT-NEXT: store <8 x i16> [[TMP14]], ptr [[TMP10]], align 2
+; DEFAULT-NEXT: store <8 x i16> [[TMP14]], ptr [[TMP9]], align 2
; DEFAULT-NEXT: store <8 x i16> [[TMP15]], ptr [[TMP11]], align 2
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; DEFAULT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
-; DEFAULT-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
; DEFAULT: [[SCALAR_PH]]:
-; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; DEFAULT-NEXT: br label %[[FOR_BODY:.*]]
; DEFAULT: [[FOR_BODY]]:
; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -817,25 +804,22 @@ define void @dont_vectorize_with_minsize() {
; OPTSIZE: [[VECTOR_BODY]]:
; OPTSIZE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; OPTSIZE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDEX]]
-; OPTSIZE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0
-; OPTSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4
+; OPTSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP1]], align 4
; OPTSIZE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDEX]]
-; OPTSIZE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP3]], i32 0
-; OPTSIZE-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
+; OPTSIZE-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
; OPTSIZE-NEXT: [[TMP5:%.*]] = mul nsw <8 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; OPTSIZE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDEX]]
-; OPTSIZE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP6]], i32 0
-; OPTSIZE-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2
+; OPTSIZE-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP6]], align 2
; OPTSIZE-NEXT: [[TMP8:%.*]] = trunc <8 x i32> [[TMP5]] to <8 x i16>
; OPTSIZE-NEXT: [[TMP9:%.*]] = add <8 x i16> [[TMP8]], [[WIDE_LOAD2]]
-; OPTSIZE-NEXT: store <8 x i16> [[TMP9]], ptr [[TMP7]], align 2
+; OPTSIZE-NEXT: store <8 x i16> [[TMP9]], ptr [[TMP6]], align 2
; OPTSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; OPTSIZE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; OPTSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; OPTSIZE: [[MIDDLE_BLOCK]]:
-; OPTSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
; OPTSIZE: [[SCALAR_PH]]:
-; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]]
; OPTSIZE: [[FOR_BODY]]:
; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -864,25 +848,22 @@ define void @dont_vectorize_with_minsize() {
; MINSIZE: [[VECTOR_BODY]]:
; MINSIZE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; MINSIZE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDEX]]
-; MINSIZE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0
-; MINSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; MINSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; MINSIZE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDEX]]
-; MINSIZE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP3]], i32 0
-; MINSIZE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
+; MINSIZE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
; MINSIZE-NEXT: [[TMP5:%.*]] = mul nsw <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; MINSIZE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDEX]]
-; MINSIZE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP6]], i32 0
-; MINSIZE-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i16>, ptr [[TMP7]], align 2
+; MINSIZE-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i16>, ptr [[TMP6]], align 2
; MINSIZE-NEXT: [[TMP8:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16>
; MINSIZE-NEXT: [[TMP9:%.*]] = add <2 x i16> [[TMP8]], [[WIDE_LOAD2]]
-; MINSIZE-NEXT: store <2 x i16> [[TMP9]], ptr [[TMP7]], align 2
+; MINSIZE-NEXT: store <2 x i16> [[TMP9]], ptr [[TMP6]], align 2
; MINSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; MINSIZE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; MINSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; MINSIZE: [[MIDDLE_BLOCK]]:
-; MINSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; MINSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
; MINSIZE: [[SCALAR_PH]]:
-; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; MINSIZE-NEXT: br label %[[FOR_BODY:.*]]
; MINSIZE: [[FOR_BODY]]:
; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -939,35 +920,32 @@ define void @vectorization_forced_minsize_reduce_width() {
; DEFAULT: [[VECTOR_BODY]]:
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDEX]]
-; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0
; DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 8
-; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4
+; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP1]], align 4
; DEFAULT-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDEX]]
-; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP4]], i32 0
; DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP4]], i32 8
-; DEFAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
+; DEFAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
; DEFAULT-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4
; DEFAULT-NEXT: [[TMP7:%.*]] = mul nsw <8 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; DEFAULT-NEXT: [[TMP8:%.*]] = mul nsw <8 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
; DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDEX]]
-; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP9]], i32 0
; DEFAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP9]], i32 8
-; DEFAULT-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i16>, ptr [[TMP10]], align 2
+; DEFAULT-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i16>, ptr [[TMP9]], align 2
; DEFAULT-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i16>, ptr [[TMP11]], align 2
; DEFAULT-NEXT: [[TMP12:%.*]] = trunc <8 x i32> [[TMP7]] to <8 x i16>
; DEFAULT-NEXT: [[TMP13:%.*]] = trunc <8 x i32> [[TMP8]] to <8 x i16>
; DEFAULT-NEXT: [[TMP14:%.*]] = add <8 x i16> [[TMP12]], [[WIDE_LOAD4]]
; DEFAULT-NEXT: [[TMP15:%.*]] = add <8 x i16> [[TMP13]], [[WIDE_LOAD5]]
-; DEFAULT-NEXT: store <8 x i16> [[TMP14]], ptr [[TMP10]], align 2
+; DEFAULT-NEXT: store <8 x i16> [[TMP14]], ptr [[TMP9]], align 2
; DEFAULT-NEXT: store <8 x i16> [[TMP15]], ptr [[TMP11]], align 2
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; DEFAULT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
-; DEFAULT-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
; DEFAULT: [[SCALAR_PH]]:
-; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; DEFAULT-NEXT: br label %[[FOR_BODY:.*]]
; DEFAULT: [[FOR_BODY]]:
; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -996,25 +974,22 @@ define void @vectorization_forced_minsize_reduce_width() {
; OPTSIZE: [[VECTOR_BODY]]:
; OPTSIZE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; OPTSIZE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDEX]]
-; OPTSIZE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0
-; OPTSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4
+; OPTSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP1]], align 4
; OPTSIZE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDEX]]
-; OPTSIZE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP3]], i32 0
-; OPTSIZE-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
+; OPTSIZE-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
; OPTSIZE-NEXT: [[TMP5:%.*]] = mul nsw <8 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; OPTSIZE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDEX]]
-; OPTSIZE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP6]], i32 0
-; OPTSIZE-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2
+; OPTSIZE-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP6]], align 2
; OPTSIZE-NEXT: [[TMP8:%.*]] = trunc <8 x i32> [[TMP5]] to <8 x i16>
; OPTSIZE-NEXT: [[TMP9:%.*]] = add <8 x i16> [[TMP8]], [[WIDE_LOAD2]]
-; OPTSIZE-NEXT: store <8 x i16> [[TMP9]], ptr [[TMP7]], align 2
+; OPTSIZE-NEXT: store <8 x i16> [[TMP9]], ptr [[TMP6]], align 2
; OPTSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; OPTSIZE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; OPTSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; OPTSIZE: [[MIDDLE_BLOCK]]:
-; OPTSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
; OPTSIZE: [[SCALAR_PH]]:
-; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]]
; OPTSIZE: [[FOR_BODY]]:
; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -1043,25 +1018,22 @@ define void @vectorization_forced_minsize_reduce_width() {
; MINSIZE: [[VECTOR_BODY]]:
; MINSIZE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; MINSIZE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDEX]]
-; MINSIZE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0
-; MINSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; MINSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; MINSIZE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDEX]]
-; MINSIZE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP3]], i32 0
-; MINSIZE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
+; MINSIZE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
; MINSIZE-NEXT: [[TMP5:%.*]] = mul nsw <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; MINSIZE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDEX]]
-; MINSIZE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP6]], i32 0
-; MINSIZE-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i16>, ptr [[TMP7]], align 2
+; MINSIZE-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i16>, ptr [[TMP6]], align 2
; MINSIZE-NEXT: [[TMP8:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16>
; MINSIZE-NEXT: [[TMP9:%.*]] = add <2 x i16> [[TMP8]], [[WIDE_LOAD2]]
-; MINSIZE-NEXT: store <2 x i16> [[TMP9]], ptr [[TMP7]], align 2
+; MINSIZE-NEXT: store <2 x i16> [[TMP9]], ptr [[TMP6]], align 2
; MINSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; MINSIZE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; MINSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; MINSIZE: [[MIDDLE_BLOCK]]:
-; MINSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; MINSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
; MINSIZE: [[SCALAR_PH]]:
-; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; MINSIZE-NEXT: br label %[[FOR_BODY:.*]]
; MINSIZE: [[FOR_BODY]]:
; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
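
Two notes on the optsize_minsize hunks above. First, the zero-offset GEP cleanup applies equally to the masked intrinsics in the predicated loops: the pointer operand of llvm.masked.load/llvm.masked.store now takes the base address directly. Second, dropping one temporary shifts every later auto-generated value and block number down by one, which is why the PRED_STORE_IF/PRED_STORE_CONTINUE suffixes are renumbered wholesale rather than changed in meaning. A minimal sketch of the masked form (hypothetical function name; the intrinsic signature matches the calls in the checks):

define void @masked_store_base(ptr %p, <vscale x 16 x i8> %v, <vscale x 16 x i1> %m) {
  ; No intermediate 'getelementptr ..., i32 0' is needed: the
  ; masked store writes through the base pointer %p itself.
  call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %v, ptr %p, i32 1, <vscale x 16 x i1> %m)
  ret void
}
declare void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8>, ptr, i32, <vscale x 16 x i1>)
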
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
index b02b314..787d63c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -26,12 +26,9 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
-; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
@@ -73,12 +70,9 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
-; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
-; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
@@ -119,12 +113,9 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
@@ -192,12 +183,9 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
-; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
@@ -238,12 +226,9 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
-; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
-; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
@@ -284,12 +269,9 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
@@ -356,12 +338,9 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
-; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
@@ -403,12 +382,9 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
-; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
-; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
@@ -449,12 +425,9 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
@@ -524,12 +497,9 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
-; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
@@ -572,12 +542,9 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
-; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
-; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
@@ -618,12 +585,9 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
@@ -695,12 +659,9 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
-; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
@@ -743,12 +704,9 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
-; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
-; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
@@ -791,12 +749,9 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
@@ -870,12 +825,9 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
-; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
@@ -920,12 +872,9 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
-; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
-; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
@@ -968,12 +917,9 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
@@ -1050,12 +996,9 @@ define i32 @chained_partial_reduce_madd_extadd(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
-; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
@@ -1095,12 +1038,9 @@ define i32 @chained_partial_reduce_madd_extadd(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
-; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
-; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
@@ -1140,12 +1080,9 @@ define i32 @chained_partial_reduce_madd_extadd(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
@@ -1210,10 +1147,8 @@ define i32 @chained_partial_reduce_extadd_extadd(ptr %a, ptr %b, i32 %N) #0 {
; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
-; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-NEON-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]])
@@ -1250,10 +1185,8 @@ define i32 @chained_partial_reduce_extadd_extadd(ptr %a, ptr %b, i32 %N) #0 {
; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP12:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP11]]
@@ -1290,10 +1223,8 @@ define i32 @chained_partial_reduce_extadd_extadd(ptr %a, ptr %b, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP11]])
@@ -1354,12 +1285,9 @@ define i32 @chained_partial_reduce_extadd_madd(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
-; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0
-; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
@@ -1399,12 +1327,9 @@ define i32 @chained_partial_reduce_extadd_madd(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
-; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
-; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
-; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
@@ -1444,12 +1369,9 @@ define i32 @chained_partial_reduce_extadd_madd(ptr %a, ptr %b, ptr %c, i32 %N) #
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
-; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
+; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
index 7090ae8..be8cfa2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -15,12 +15,10 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP8]], [[TMP5]]
; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]])
@@ -29,8 +27,6 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
-; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]]
-; CHECK: scalar.ph:
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
index 1d37414..d01effd 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
@@ -25,20 +25,18 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
; CHECK-NEXT: [[TMP11:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 8
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
; CHECK-NEXT: [[TMP18:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
; CHECK-NEXT: [[TMP19:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
@@ -75,20 +73,18 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
; CHECK-NOI8MM-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
-; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1
; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NOI8MM-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 8
; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
-; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
; CHECK-NOI8MM-NEXT: [[TMP18:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
@@ -148,20 +144,18 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
; CHECK-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 8
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
; CHECK-NEXT: [[TMP18:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
; CHECK-NEXT: [[TMP19:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
@@ -198,20 +192,18 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
; CHECK-NOI8MM-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
-; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1
; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NOI8MM-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 8
; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
-; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
; CHECK-NOI8MM-NEXT: [[TMP18:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
@@ -263,16 +255,14 @@ define i32 @sudot_neon(ptr %a, ptr %b) #1 {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 16
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
; CHECK-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
@@ -286,7 +276,7 @@ define i32 @sudot_neon(ptr %a, ptr %b) #1 {
; CHECK: middle.block:
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK: scalar.ph:
;
; CHECK-NOI8MM-LABEL: define i32 @sudot_neon(
@@ -300,16 +290,14 @@ define i32 @sudot_neon(ptr %a, ptr %b) #1 {
; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
; CHECK-NOI8MM-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NOI8MM-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
-; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NOI8MM-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NOI8MM-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 16
-; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
@@ -323,7 +311,7 @@ define i32 @sudot_neon(ptr %a, ptr %b) #1 {
; CHECK-NOI8MM: middle.block:
; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP13]], [[TMP12]]
; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
-; CHECK-NOI8MM-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NOI8MM-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-NOI8MM: scalar.ph:
;
entry:
@@ -360,16 +348,14 @@ define i32 @usdot_neon(ptr %a, ptr %b) #1 {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 16
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
; CHECK-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
@@ -383,7 +369,7 @@ define i32 @usdot_neon(ptr %a, ptr %b) #1 {
; CHECK: middle.block:
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK: scalar.ph:
;
; CHECK-NOI8MM-LABEL: define i32 @usdot_neon(
@@ -397,16 +383,14 @@ define i32 @usdot_neon(ptr %a, ptr %b) #1 {
; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
; CHECK-NOI8MM-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NOI8MM-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
-; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NOI8MM-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NOI8MM-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 16
-; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
@@ -420,7 +404,7 @@ define i32 @usdot_neon(ptr %a, ptr %b) #1 {
; CHECK-NOI8MM: middle.block:
; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP13]], [[TMP12]]
; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
-; CHECK-NOI8MM-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NOI8MM-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-NOI8MM: scalar.ph:
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
index 0c79086..25a4ab8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
@@ -17,12 +17,10 @@ define i32 @dotp(ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]])
@@ -31,7 +29,7 @@ define i32 @dotp(ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
-; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i32 @dotp(
@@ -45,16 +43,14 @@ define i32 @dotp(ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
@@ -68,7 +64,7 @@ define i32 @dotp(ptr %a, ptr %b) {
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i32 @dotp(
@@ -81,12 +77,10 @@ define i32 @dotp(ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]])
@@ -95,7 +89,7 @@ define i32 @dotp(ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
-; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-MAXBW: scalar.ph:
;
entry:
@@ -147,8 +141,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP16]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
@@ -206,7 +199,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]])
-; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types(
@@ -252,9 +245,8 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], 30
; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31
; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
-; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP32]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP32]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
@@ -366,7 +358,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) {
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]]
; CHECK-INTERLEAVED-NEXT: [[TMP140:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i32 @not_dotp_different_types(
@@ -395,8 +387,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
-; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP16]], align 1
; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
@@ -454,7 +445,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]])
-; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-MAXBW: scalar.ph:
;
entry:
@@ -490,12 +481,10 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
@@ -504,9 +493,6 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
-; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15
-; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried(
@@ -519,12 +505,10 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]]
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
@@ -533,9 +517,6 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15
-; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried(
@@ -548,12 +529,10 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
@@ -562,9 +541,6 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
-; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15
-; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
;
entry:
@@ -600,26 +576,22 @@ define i32 @not_dotp_not_phi(ptr %a, ptr noalias %b, ptr noalias %c) {
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = mul <16 x i32> [[TMP5]], [[TMP2]]
; CHECK-INTERLEAVE1-NEXT: [[TMP7]] = add <16 x i32> [[TMP6]], [[TMP5]]
; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[C]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
-; CHECK-INTERLEAVE1-NEXT: store <16 x i32> [[TMP8]], ptr [[TMP10]], align 4
+; CHECK-INTERLEAVE1-NEXT: store <16 x i32> [[TMP8]], ptr [[TMP13]], align 4
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
-; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
-; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_phi(
@@ -632,26 +604,22 @@ define i32 @not_dotp_not_phi(ptr %a, ptr noalias %b, ptr noalias %c) {
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul <16 x i32> [[TMP5]], [[TMP2]]
; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <16 x i32> [[TMP6]], [[TMP5]]
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[C]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
-; CHECK-INTERLEAVED-NEXT: store <16 x i32> [[TMP8]], ptr [[TMP10]], align 4
+; CHECK-INTERLEAVED-NEXT: store <16 x i32> [[TMP8]], ptr [[TMP13]], align 4
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
-; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
-; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_phi(
@@ -664,26 +632,22 @@ define i32 @not_dotp_not_phi(ptr %a, ptr noalias %b, ptr noalias %c) {
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul <16 x i32> [[TMP5]], [[TMP2]]
; CHECK-MAXBW-NEXT: [[TMP7]] = add <16 x i32> [[TMP6]], [[TMP5]]
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[C]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
-; CHECK-MAXBW-NEXT: store <16 x i32> [[TMP8]], ptr [[TMP10]], align 4
+; CHECK-MAXBW-NEXT: store <16 x i32> [[TMP8]], ptr [[TMP13]], align 4
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
-; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
-; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-MAXBW: scalar.ph:
;
entry:
@@ -737,35 +701,27 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = or disjoint i64 [[INDEX]], 3
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]])
@@ -811,15 +767,13 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = or disjoint i64 [[INDEX]], 3
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP15]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP16]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
@@ -827,15 +781,13 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP14]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP19]])
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP20]])
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i8>, ptr [[TMP25]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP26]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32>
@@ -843,15 +795,13 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nsw <16 x i32> [[TMP24]], [[TMP28]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP29]])
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP30]])
-; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load <16 x i8>, ptr [[TMP31]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP32]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <16 x i8>, ptr [[TMP35]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP36]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
@@ -859,15 +809,13 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = mul nsw <16 x i32> [[TMP34]], [[TMP56]]
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP39]])
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP40]])
-; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP41]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <16 x i8>, ptr [[TMP42]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD26:%.*]] = load <16 x i8>, ptr [[TMP45]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD26:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP46]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32>
@@ -917,35 +865,27 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = or disjoint i64 [[INDEX]], 3
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP15]], [[TMP13]]
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
-; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP20]]
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP21]])
-; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP24]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP25]]
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP26]])
-; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP29]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP28]], [[TMP30]]
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]])
@@ -2055,12 +1995,10 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) {
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]]
@@ -2070,7 +2008,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) {
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]])
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15
-; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user(
@@ -2084,16 +2022,14 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
@@ -2108,7 +2044,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP14]], [[TMP13]]
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = extractelement <16 x i32> [[TMP10]], i32 15
-; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user(
@@ -2121,12 +2057,10 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) {
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]]
@@ -2136,7 +2070,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) {
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]])
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15
-; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-MAXBW: scalar.ph:
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 14a73db..5218e64 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -25,12 +25,10 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX1]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP16]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX1]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP20]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP18]], [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]]
@@ -62,20 +60,18 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX1]]
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP14]], 4
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP20]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP20]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX1]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP28]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = mul nuw i64 [[TMP26]], 4
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP28]], i64 [[TMP27]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP28]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
@@ -111,12 +107,10 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP20]], [[TMP13]]
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI1]], <vscale x 8 x i32> [[TMP22]])
@@ -173,11 +167,9 @@ define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP4]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i8>, ptr [[TMP6]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP1]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD2]] to <vscale x 2 x i64>
; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP13]], [[TMP11]]
; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add <vscale x 2 x i64> [[TMP14]], [[VEC_PHI]]
@@ -212,19 +204,17 @@ define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 2
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP12]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP4]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP13]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD3]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 2
; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i64 [[TMP18]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[TMP6]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 2 x i8>, ptr [[TMP19]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD4]] to <vscale x 2 x i64>
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD5]] to <vscale x 2 x i64>
@@ -263,11 +253,9 @@ define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-MAXBW-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP]], align 1
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i64>
-; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP1]], align 1
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i64>
; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP13]], [[TMP11]]
; CHECK-MAXBW-NEXT: [[TMP15]] = add <vscale x 8 x i64> [[TMP14]], [[VEC_PHI]]
@@ -330,11 +318,9 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]]
; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX]], 2
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX1]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i16>, ptr [[TMP4]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i16>, ptr [[NEXT_GEP]], align 2
; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[NEXT_GEP2]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i16>, ptr [[TMP6]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i16>, ptr [[NEXT_GEP2]], align 2
; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD3]] to <vscale x 2 x i64>
; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP15]], [[TMP13]]
; CHECK-INTERLEAVE1-NEXT: [[TMP17]] = add <vscale x 2 x i64> [[TMP16]], [[VEC_PHI]]
@@ -373,19 +359,17 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]]
; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX2:%.*]] = mul i64 [[INDEX]], 2
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX2]]
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 2
; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i16>, ptr [[TMP4]], align 2
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i16>, ptr [[NEXT_GEP]], align 2
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i16>, ptr [[TMP30]], align 2
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD]] to <vscale x 2 x i64>
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD4]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 2
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i64 [[TMP20]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 2 x i16>, ptr [[TMP8]], align 2
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 2 x i16>, ptr [[NEXT_GEP3]], align 2
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 2 x i16>, ptr [[TMP21]], align 2
; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD5]] to <vscale x 2 x i64>
; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 2 x i16> [[WIDE_LOAD6]] to <vscale x 2 x i64>
@@ -428,11 +412,9 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %
; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]]
; CHECK-MAXBW-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX]], 2
; CHECK-MAXBW-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX1]]
-; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP12]], align 2
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[NEXT_GEP]], align 2
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[NEXT_GEP2]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i16>, ptr [[TMP14]], align 2
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i16>, ptr [[NEXT_GEP2]], align 2
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD3]] to <vscale x 4 x i64>
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nuw nsw <vscale x 4 x i64> [[TMP15]], [[TMP13]]
; CHECK-MAXBW-NEXT: [[TMP17]] = add <vscale x 4 x i64> [[TMP16]], [[VEC_PHI]]
@@ -496,8 +478,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP16]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
@@ -555,7 +536,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]])
-; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types(
@@ -601,9 +582,8 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], 30
; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31
; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
-; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP32]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP32]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
@@ -715,7 +695,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]]
; CHECK-INTERLEAVED-NEXT: [[TMP142:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i32 @not_dotp_different_types(
@@ -744,8 +724,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
-; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP32]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP33]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP32]], align 1
; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]]
; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
@@ -803,7 +782,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP138]])
-; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK-MAXBW: scalar.ph:
;
entry:
@@ -851,12 +830,10 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP16]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]]
; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP16]], i32 -1)
@@ -894,20 +871,18 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 8
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 8
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP20]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP21]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
@@ -953,12 +928,10 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
; CHECK-MAXBW-NEXT: [[TMP25]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]]
; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP25]], i32 -1)
@@ -1019,12 +992,10 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]]
; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP15]]
@@ -1107,12 +1078,10 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1
; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP14]]
; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP19]]
@@ -1187,35 +1156,27 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = or disjoint i64 [[INDEX]], 3
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP2]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = mul nsw <vscale x 4 x i32> [[TMP21]], [[TMP36]]
; CHECK-INTERLEAVE1-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP38]], [[VEC_PHI3]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i8>, ptr [[TMP4]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD5]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 4 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 4 x i8>, ptr [[TMP5]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD6]] to <vscale x 4 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = mul nsw <vscale x 4 x i32> [[TMP25]], [[TMP42]]
; CHECK-INTERLEAVE1-NEXT: [[TMP30]] = add <vscale x 4 x i32> [[TMP28]], [[VEC_PHI2]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 4 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD7]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 4 x i8>, ptr [[TMP24]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD8]] to <vscale x 4 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = mul nsw <vscale x 4 x i32> [[TMP31]], [[TMP33]]
; CHECK-INTERLEAVE1-NEXT: [[TMP35]] = add <vscale x 4 x i32> [[TMP34]], [[VEC_PHI1]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 4 x i8>, ptr [[TMP27]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD9]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 4 x i8>, ptr [[TMP29]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD10]] to <vscale x 4 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = mul nsw <vscale x 4 x i32> [[TMP37]], [[TMP39]]
; CHECK-INTERLEAVE1-NEXT: [[TMP41]] = add <vscale x 4 x i32> [[TMP40]], [[VEC_PHI]]
@@ -1267,19 +1228,17 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = or disjoint i64 [[INDEX]], 3
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP56]], 4
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP20]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 4 x i8>, ptr [[TMP21]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD8]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nuw i64 [[TMP25]], 4
; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 [[TMP26]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 4 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 4 x i8>, ptr [[TMP2]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 4 x i8>, ptr [[TMP72]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD9]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD10]] to <vscale x 4 x i32>
@@ -1287,19 +1246,17 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <vscale x 4 x i32> [[TMP82]], [[TMP23]]
; CHECK-INTERLEAVED-NEXT: [[TMP50]] = add <vscale x 4 x i32> [[TMP30]], [[VEC_PHI6]]
; CHECK-INTERLEAVED-NEXT: [[TMP33]] = add <vscale x 4 x i32> [[TMP31]], [[VEC_PHI7]]
-; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = mul nuw i64 [[TMP35]], 4
; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 [[TMP36]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <vscale x 4 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <vscale x 4 x i8>, ptr [[TMP4]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <vscale x 4 x i8>, ptr [[TMP37]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD11]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD12]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = mul nuw i64 [[TMP41]], 4
; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP42]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <vscale x 4 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <vscale x 4 x i8>, ptr [[TMP5]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <vscale x 4 x i8>, ptr [[TMP43]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD13]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD14]] to <vscale x 4 x i32>
@@ -1307,19 +1264,17 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = mul nsw <vscale x 4 x i32> [[TMP39]], [[TMP45]]
; CHECK-INTERLEAVED-NEXT: [[TMP48]] = add <vscale x 4 x i32> [[TMP46]], [[VEC_PHI4]]
; CHECK-INTERLEAVED-NEXT: [[TMP49]] = add <vscale x 4 x i32> [[TMP47]], [[VEC_PHI5]]
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = mul nuw i64 [[TMP51]], 4
; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP52]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <vscale x 4 x i8>, ptr [[TMP22]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD16:%.*]] = load <vscale x 4 x i8>, ptr [[TMP53]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD15]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD16]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = mul nuw i64 [[TMP57]], 4
; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[TMP58]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD17:%.*]] = load <vscale x 4 x i8>, ptr [[TMP24]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD17:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load <vscale x 4 x i8>, ptr [[TMP59]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD17]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD18]] to <vscale x 4 x i32>
@@ -1327,19 +1282,17 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = mul nsw <vscale x 4 x i32> [[TMP55]], [[TMP61]]
; CHECK-INTERLEAVED-NEXT: [[TMP64]] = add <vscale x 4 x i32> [[TMP62]], [[VEC_PHI2]]
; CHECK-INTERLEAVED-NEXT: [[TMP65]] = add <vscale x 4 x i32> [[TMP63]], [[VEC_PHI3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = mul nuw i64 [[TMP67]], 4
; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP68]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <vscale x 4 x i8>, ptr [[TMP27]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <vscale x 4 x i8>, ptr [[TMP69]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD19]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD20]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = mul nuw i64 [[TMP73]], 4
; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 [[TMP74]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <vscale x 4 x i8>, ptr [[TMP29]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD22:%.*]] = load <vscale x 4 x i8>, ptr [[TMP75]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD21]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD22]] to <vscale x 4 x i32>
@@ -1395,35 +1348,27 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = or disjoint i64 [[INDEX]], 3
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
-; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 8 x i8>, ptr [[TMP24]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD9]] to <vscale x 8 x i32>
; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = mul nsw <vscale x 8 x i32> [[TMP29]], [[TMP23]]
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE11]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI7]], <vscale x 8 x i32> [[TMP31]])
-; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load <vscale x 8 x i8>, ptr [[TMP32]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD12:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD12]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD14:%.*]] = load <vscale x 8 x i8>, ptr [[TMP38]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD14:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD14]] to <vscale x 8 x i32>
; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = mul nsw <vscale x 8 x i32> [[TMP37]], [[TMP43]]
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI6]], <vscale x 8 x i32> [[TMP45]])
-; CHECK-MAXBW-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD18:%.*]] = load <vscale x 8 x i8>, ptr [[TMP46]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD18:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD18]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD20:%.*]] = load <vscale x 8 x i8>, ptr [[TMP52]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD20:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
; CHECK-MAXBW-NEXT: [[TMP57:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD20]] to <vscale x 8 x i32>
; CHECK-MAXBW-NEXT: [[TMP59:%.*]] = mul nsw <vscale x 8 x i32> [[TMP51]], [[TMP57]]
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE17]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI5]], <vscale x 8 x i32> [[TMP59]])
-; CHECK-MAXBW-NEXT: [[TMP60:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD24:%.*]] = load <vscale x 8 x i8>, ptr [[TMP60]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD24:%.*]] = load <vscale x 8 x i8>, ptr [[TMP16]], align 1
; CHECK-MAXBW-NEXT: [[TMP65:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD24]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD26:%.*]] = load <vscale x 8 x i8>, ptr [[TMP66]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD26:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD26]] to <vscale x 8 x i32>
; CHECK-MAXBW-NEXT: [[TMP73:%.*]] = mul nsw <vscale x 8 x i32> [[TMP65]], [[TMP71]]
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI4]], <vscale x 8 x i32> [[TMP73]])
@@ -1520,12 +1465,10 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP11]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP15]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP14]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 4 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP16]], [[TMP13]]
; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]]
@@ -1565,12 +1508,10 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP11]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP15]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP14]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP16]], [[TMP13]]
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]]
@@ -1610,12 +1551,10 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP11]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32>
; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP15]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP14]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 16 x i32>
; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 16 x i32> [[TMP16]], [[TMP13]]
; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i32> [[TMP17]], <vscale x 16 x i32> zeroinitializer
@@ -1671,12 +1610,10 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP4]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP3]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP12]], [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]]
@@ -1712,20 +1649,18 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP15]], 4
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP4]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP3]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 4
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP17]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
@@ -1765,12 +1700,10 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP20]], [[TMP13]]
; CHECK-MAXBW-NEXT: [[TMP24]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]]
@@ -1829,13 +1762,11 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP7]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[INDEX]], 1
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP11]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i8>, ptr [[TMP12]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i8>, ptr [[TMP11]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD1]] to <vscale x 2 x i64>
; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP13]], [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add <vscale x 2 x i64> [[VEC_PHI]], [[TMP14]]
@@ -1868,21 +1799,19 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 2
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP7]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i8>, ptr [[TMP11]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD2]] to <vscale x 2 x i64>
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[INDEX]], 1
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 2
; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i64 [[TMP18]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP16]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP15]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[TMP19]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD3]] to <vscale x 2 x i64>
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD4]] to <vscale x 2 x i64>
@@ -1919,23 +1848,10 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i64>
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[INDEX]], 1
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]]
-; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP11]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
-; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i64>
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP17]], [[TMP9]]
-; CHECK-MAXBW-NEXT: [[TMP14]] = add <vscale x 8 x i64> [[VEC_PHI]], [[TMP13]]
-; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> [[TMP14]])
-; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
;
entry:
@@ -2155,8 +2071,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul nuw nsw <8 x i64> [[TMP3]], [[BROADCAST_SPLAT]]
; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]]
@@ -2189,9 +2104,8 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 8
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64>
@@ -2234,8 +2148,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP7]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP8]], align 2
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP7]], align 2
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i64>
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = mul nuw nsw <vscale x 4 x i64> [[TMP9]], [[BROADCAST_SPLAT]]
; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]]
@@ -2293,8 +2206,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul nuw nsw <8 x i64> [[BROADCAST_SPLAT]], [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]]
@@ -2327,9 +2239,8 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 8
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64>
@@ -2372,8 +2283,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP7]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP8]], align 2
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP7]], align 2
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i64>
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = mul nuw nsw <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]]
@@ -2440,11 +2350,9 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP13]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i8>, ptr [[TMP15]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP1]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD2]] to <vscale x 2 x i64>
; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = mul nuw nsw <vscale x 2 x i64> [[TMP16]], [[TMP14]]
; CHECK-INTERLEAVE1-NEXT: [[TMP18]] = add <vscale x 2 x i64> [[TMP17]], [[VEC_PHI]]
@@ -2486,19 +2394,17 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 2
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP15]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP13]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i8>, ptr [[TMP16]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD]] to <vscale x 2 x i64>
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD3]] to <vscale x 2 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 2
; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 [[TMP21]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 2 x i8>, ptr [[NEXT_GEP2]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 2 x i8>, ptr [[TMP22]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD4]] to <vscale x 2 x i64>
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = zext <vscale x 2 x i8> [[WIDE_LOAD5]] to <vscale x 2 x i64>
@@ -2544,11 +2450,9 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-MAXBW-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP]], align 1
; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i64>
-; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP15]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP1]], align 1
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i64>
; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP16]], [[TMP14]]
; CHECK-MAXBW-NEXT: [[TMP20]] = add <vscale x 8 x i64> [[TMP17]], [[VEC_PHI]]
@@ -2637,8 +2541,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum,
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = shl nsw i64 [[INDEX]], 3
; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP11]]
@@ -2738,8 +2641,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum,
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE15:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = shl nsw i64 [[INDEX]], 3
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP11]]
@@ -2839,8 +2741,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum,
; CHECK-MAXBW-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE15:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = shl nsw i64 [[INDEX]], 3
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP11]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll
index 3515365..e24b47d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll
@@ -20,9 +20,8 @@ define i32 @partial_reduce_with_non_constant_start_value(ptr %src, i32 %rdx.star
; IC2-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ]
; IC2-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], %[[VECTOR_BODY]] ]
; IC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
-; IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
-; IC2-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; IC2-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; IC2-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; IC2-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; IC2-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
@@ -74,11 +73,10 @@ define i32 @partial_reduce_with_non_constant_start_value(ptr %src, i32 %rdx.star
; IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE8:%.*]], %[[VECTOR_BODY]] ]
; IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], %[[VECTOR_BODY]] ]
; IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
-; IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
; IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 32
; IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 48
-; IC4-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; IC4-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
; IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll
index 09d09e2..ae33e46 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll
@@ -16,16 +16,14 @@ define i32 @not_dotp(ptr %a, ptr %b) {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
; CHECK-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
; CHECK-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
@@ -39,7 +37,7 @@ define i32 @not_dotp(ptr %a, ptr %b) {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP14]], [[TMP13]]
; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
-; CHECK-NEXT: br i1 false, [[FOR_EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
;
entry:
@@ -69,7 +67,7 @@ for.exit: ; preds = %for.body
define i40 @partial_reduce_not_known_factor(i32 %a, i32 %b, i16 %N) {
; CHECK-LABEL: define i40 @partial_reduce_not_known_factor(
; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i16 [[N:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[SMAX:%.*]] = call i16 @llvm.smax.i16(i16 [[N]], i16 0)
; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i16 [[SMAX]] to i32
; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[TMP0]], 1
@@ -100,7 +98,9 @@ define i40 @partial_reduce_not_known_factor(i32 %a, i32 %b, i16 %N) {
; CHECK-NEXT: [[BIN_RDX:%.*]] = or <2 x i40> [[TMP8]], [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = call i40 @llvm.vector.reduce.or.v2i40(<2 x i40> [[BIN_RDX]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll
index a471c00..a46340c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll
@@ -25,12 +25,10 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP12]], [[TMP9]]
; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = sub <vscale x 4 x i32> zeroinitializer, [[TMP13]]
@@ -63,20 +61,18 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP7]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 4
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP14]], i64 [[TMP17]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP15]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP14]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
@@ -114,12 +110,10 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[PARTIAL_REDUCE:%.*]], [[FOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
-; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
-; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = mul <vscale x 8 x i32> [[TMP12]], [[TMP9]]
; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = sub <vscale x 8 x i32> zeroinitializer, [[TMP13]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
index 63eb97a..d2c03d1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
@@ -19,8 +19,7 @@ define i32 @zext_add_reduc_i8_i32_sve(ptr %a) #0 {
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP3]] = add <16 x i32> [[TMP2]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -28,7 +27,7 @@ define i32 @zext_add_reduc_i8_i32_sve(ptr %a) #0 {
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP3]])
-; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i32 @zext_add_reduc_i8_i32_sve(
@@ -42,9 +41,8 @@ define i32 @zext_add_reduc_i8_i32_sve(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
@@ -56,7 +54,7 @@ define i32 @zext_add_reduc_i8_i32_sve(ptr %a) #0 {
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP6]], [[TMP5]]
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i32 @zext_add_reduc_i8_i32_sve(
@@ -77,8 +75,7 @@ define i32 @zext_add_reduc_i8_i32_sve(ptr %a) #0 {
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP6]], align 1
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP8]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -119,8 +116,7 @@ define i32 @zext_add_reduc_i8_i32_neon(ptr %a) #2 {
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP2]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -128,7 +124,7 @@ define i32 @zext_add_reduc_i8_i32_neon(ptr %a) #2 {
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
-; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i32 @zext_add_reduc_i8_i32_neon(
@@ -142,9 +138,8 @@ define i32 @zext_add_reduc_i8_i32_neon(ptr %a) #2 {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
@@ -156,7 +151,7 @@ define i32 @zext_add_reduc_i8_i32_neon(ptr %a) #2 {
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i32 @zext_add_reduc_i8_i32_neon(
@@ -169,8 +164,7 @@ define i32 @zext_add_reduc_i8_i32_neon(ptr %a) #2 {
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP2]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -178,7 +172,7 @@ define i32 @zext_add_reduc_i8_i32_neon(ptr %a) #2 {
; CHECK-MAXBW-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
-; CHECK-MAXBW-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-MAXBW-NEXT: br label [[SCALAR_PH]]
; CHECK-MAXBW: scalar.ph:
;
entry:
@@ -211,8 +205,7 @@ define i64 @zext_add_reduc_i8_i64(ptr %a) #0 {
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = add <16 x i64> [[TMP3]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -220,7 +213,7 @@ define i64 @zext_add_reduc_i8_i64(ptr %a) #0 {
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP4]])
-; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i64 @zext_add_reduc_i8_i64(
@@ -234,9 +227,8 @@ define i64 @zext_add_reduc_i8_i64(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
@@ -248,7 +240,7 @@ define i64 @zext_add_reduc_i8_i64(ptr %a) #0 {
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i64> [[TMP7]], [[TMP6]]
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i64 @zext_add_reduc_i8_i64(
@@ -269,8 +261,7 @@ define i64 @zext_add_reduc_i8_i64(ptr %a) #0 {
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i64>
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP9]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -312,8 +303,7 @@ define i64 @zext_add_reduc_i16_i64(ptr %a) #0 {
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = add <8 x i64> [[TMP3]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
@@ -321,7 +311,7 @@ define i64 @zext_add_reduc_i16_i64(ptr %a) #0 {
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP4]])
-; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i64 @zext_add_reduc_i16_i64(
@@ -335,9 +325,8 @@ define i64 @zext_add_reduc_i16_i64(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[TMP1]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[TMP1]], i32 8
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64>
@@ -349,7 +338,7 @@ define i64 @zext_add_reduc_i16_i64(ptr %a) #0 {
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP7]], [[TMP6]]
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i64 @zext_add_reduc_i16_i64(
@@ -370,8 +359,7 @@ define i64 @zext_add_reduc_i16_i64(ptr %a) #0 {
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[TMP7]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP8]], align 2
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP7]], align 2
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 8 x i16> [[WIDE_LOAD]] to <vscale x 8 x i64>
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[TMP9]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -413,8 +401,7 @@ define i32 @zext_add_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 {
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -422,7 +409,7 @@ define i32 @zext_add_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 {
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
-; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i32 @zext_add_reduc_i8_i32_has_neon_dotprod(
@@ -436,9 +423,8 @@ define i32 @zext_add_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
@@ -450,7 +436,7 @@ define i32 @zext_add_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 {
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i32 @zext_add_reduc_i8_i32_has_neon_dotprod(
@@ -471,8 +457,7 @@ define i32 @zext_add_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 {
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP9]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -524,8 +509,7 @@ define i32 @zext_add_reduc_i8_i32_predicated(ptr %a) #0 {
; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP7]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP6]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> [[TMP8]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[TMP10]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP9]], <vscale x 4 x i32> [[VEC_PHI]]
@@ -559,8 +543,7 @@ define i32 @zext_add_reduc_i8_i32_predicated(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
-; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP7]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
+; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP6]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> [[TMP8]], [[VEC_PHI]]
; CHECK-INTERLEAVED-NEXT: [[TMP10]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP9]], <vscale x 4 x i32> [[VEC_PHI]]
@@ -594,8 +577,7 @@ define i32 @zext_add_reduc_i8_i32_predicated(ptr %a) #0 {
; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP7]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP6]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32>
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i32> [[TMP8]], <vscale x 16 x i32> zeroinitializer
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP9]])
@@ -721,8 +703,7 @@ define i32 @zext_sub_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 {
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = sub <16 x i32> [[VEC_PHI]], [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -730,7 +711,7 @@ define i32 @zext_sub_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 {
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]])
-; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i32 @zext_sub_reduc_i8_i32_has_neon_dotprod(
@@ -744,9 +725,8 @@ define i32 @zext_sub_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
@@ -758,7 +738,7 @@ define i32 @zext_sub_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 {
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP7]], [[TMP6]]
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i32 @zext_sub_reduc_i8_i32_has_neon_dotprod(
@@ -779,8 +759,7 @@ define i32 @zext_sub_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 {
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-MAXBW-NEXT: [[TMP10]] = sub <vscale x 8 x i32> [[VEC_PHI]], [[TMP9]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -822,8 +801,7 @@ define i32 @sext_add_reduc_i8_i32(ptr %a) #0 {
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = add <16 x i32> [[TMP3]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -831,7 +809,7 @@ define i32 @sext_add_reduc_i8_i32(ptr %a) #0 {
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]])
-; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i32 @sext_add_reduc_i8_i32(
@@ -845,9 +823,8 @@ define i32 @sext_add_reduc_i8_i32(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
@@ -859,7 +836,7 @@ define i32 @sext_add_reduc_i8_i32(ptr %a) #0 {
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP7]], [[TMP6]]
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i32 @sext_add_reduc_i8_i32(
@@ -880,8 +857,7 @@ define i32 @sext_add_reduc_i8_i32(ptr %a) #0 {
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP9]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -933,8 +909,7 @@ define i32 @add_of_zext_outside_loop(i32 %a, ptr noalias %b, i8 %c, i32 %d) #0 {
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-INTERLEAVE1-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 1
+; CHECK-INTERLEAVE1-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP3]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <16 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -966,9 +941,8 @@ define i32 @add_of_zext_outside_loop(i32 %a, ptr noalias %b, i8 %c, i32 %d) #0 {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[VEC_PHI1]]
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 16
-; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 1
+; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP3]], align 1
; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP5]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <16 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT]]
; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <16 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT]]
@@ -1008,8 +982,7 @@ define i32 @add_of_zext_outside_loop(i32 %a, ptr noalias %b, i8 %c, i32 %d) #0 {
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 16 x i32> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[FOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[INDEX]]
; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]]
-; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
-; CHECK-MAXBW-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP10]], align 1
+; CHECK-MAXBW-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP9]], align 1
; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 16 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP6]]
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -1060,8 +1033,7 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 {
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-INTERLEAVE1-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVE1-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP6]] = add <16 x i32> [[VEC_PHI]], [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -1093,9 +1065,8 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[VEC_PHI1]]
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
-; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 1
; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP6]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <16 x i32> [[VEC_PHI]], [[TMP3]]
; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <16 x i32> [[VEC_PHI2]], [[TMP3]]
@@ -1135,8 +1106,7 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 {
; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP8]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[FOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[INDEX]]
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]]
-; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
-; CHECK-MAXBW-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP11]], align 1
+; CHECK-MAXBW-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP10]], align 1
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP9]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP6]]
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
index 492ab563..08d35f7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
@@ -353,11 +353,10 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 {
; DEFAULT-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i16> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ]
; DEFAULT-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i16> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ]
; DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[INDEX]]
-; DEFAULT-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr [[TMP13]], i32 0
; DEFAULT-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 4
; DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[TMP13]], i64 [[TMP17]]
-; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP15]], align 2
+; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP13]], align 2
; DEFAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i16>, ptr [[TMP18]], align 2
; DEFAULT-NEXT: [[TMP19:%.*]] = udiv <vscale x 4 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; DEFAULT-NEXT: [[TMP20:%.*]] = udiv <vscale x 4 x i16> [[WIDE_LOAD2]], [[BROADCAST_SPLAT]]
@@ -412,11 +411,10 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 {
; VSCALEFORTUNING2-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i16> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
; VSCALEFORTUNING2-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i16> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
; VSCALEFORTUNING2-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[INDEX]]
-; VSCALEFORTUNING2-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[TMP8]], i32 0
; VSCALEFORTUNING2-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
; VSCALEFORTUNING2-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4
; VSCALEFORTUNING2-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[TMP8]], i64 [[TMP11]]
-; VSCALEFORTUNING2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP9]], align 2
+; VSCALEFORTUNING2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP8]], align 2
; VSCALEFORTUNING2-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i16>, ptr [[TMP12]], align 2
; VSCALEFORTUNING2-NEXT: [[TMP13:%.*]] = udiv <vscale x 4 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; VSCALEFORTUNING2-NEXT: [[TMP14:%.*]] = udiv <vscale x 4 x i16> [[WIDE_LOAD2]], [[BROADCAST_SPLAT]]
@@ -476,8 +474,7 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 {
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
; PRED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
; PRED-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[INDEX]]
-; PRED-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr [[TMP14]], i32 0
-; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP15]], i32 2, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> poison)
+; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP14]], i32 2, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> poison)
; PRED-NEXT: [[TMP20:%.*]] = udiv <vscale x 8 x i16> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
; PRED-NEXT: [[TMP21:%.*]] = or <vscale x 8 x i16> [[TMP20]], [[VEC_PHI]]
; PRED-NEXT: [[TMP16]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> [[TMP21]], <vscale x 8 x i16> [[VEC_PHI]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
index 0f2eae1..eb3d724 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
@@ -41,11 +41,9 @@ define void @test_no_scalarization(ptr %a, ptr noalias %b, i32 %idx, i32 %n) #0
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[IDX]], [[INDEX]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i64, ptr [[A:%.*]], <vscale x 2 x i32> [[VEC_IND]]
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 2 x ptr> [[TMP15]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[TMP16]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP17]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP16]], align 8
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[B:%.*]], i32 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr double, ptr [[TMP18]], i32 0
-; CHECK-NEXT: store <vscale x 2 x double> [[WIDE_LOAD]], ptr [[TMP19]], align 8
+; CHECK-NEXT: store <vscale x 2 x double> [[WIDE_LOAD]], ptr [[TMP18]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP7]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i32> [[VEC_IND]], [[DOTSPLAT2]]
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll
index 7294452..d751d39 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll
@@ -25,13 +25,11 @@ define void @load_ext_trunc_store(ptr readonly %in, ptr noalias %out, i64 %N) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw double, ptr [[IN]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP2]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = fpext <4 x double> [[WIDE_LOAD]] to <4 x fp128>
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[OUT]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = fptrunc <4 x fp128> [[TMP3]] to <4 x float>
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP12]], align 4
+; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll
index 1b489dd..011b823 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll
@@ -21,12 +21,10 @@ define float @cond_fadd(ptr noalias nocapture readonly %a, ptr noalias nocapture
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[COND:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP7]], align 4
; CHECK-NEXT: [[TMP9:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], splat (float 2.000000e+00)
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> poison)
; CHECK-NEXT: [[TMP12:%.*]] = select fast <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> zeroinitializer
; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> [[TMP12]])
; CHECK-NEXT: [[TMP14]] = fadd fast float [[TMP13]], [[VEC_PHI]]
@@ -107,12 +105,10 @@ define float @cond_cmp_sel(ptr noalias %a, ptr noalias %cond, i64 %N) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[RDX_MINMAX_SELECT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[COND:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP7]], align 4
; CHECK-NEXT: [[TMP9:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00)
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> poison)
; CHECK-NEXT: [[TMP12:%.*]] = select fast <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> splat (float 0x47EFFFFFE0000000)
; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[TMP12]])
; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt float [[TMP13]], [[VEC_PHI]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index d73cdc1..a60d35d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -47,33 +47,32 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-UNORDERED: vector.body:
; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> splat (float -0.000000e+00), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> splat (float -0.000000e+00), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP7]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP8]] = fadd <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP6]], align 4
+; CHECK-UNORDERED-NEXT: [[TMP7]] = fadd <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-UNORDERED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[TMP8]])
+; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[TMP7]])
; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-UNORDERED: scalar.ph:
; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
+; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-UNORDERED: for.body:
; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD]] = fadd float [[TMP11]], [[SUM_07]]
+; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-UNORDERED-NEXT: [[ADD]] = fadd float [[TMP10]], [[SUM_07]]
; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
+; CHECK-UNORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
; CHECK-UNORDERED-NEXT: ret float [[ADD_LCSSA]]
;
; CHECK-ORDERED-LABEL: define float @fadd_strict
@@ -93,32 +92,31 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-ORDERED: vector.body:
; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP7]], align 4
-; CHECK-ORDERED-NEXT: [[TMP8]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[WIDE_LOAD]])
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP6]], align 4
+; CHECK-ORDERED-NEXT: [[TMP7]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[WIDE_LOAD]])
; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-ORDERED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-ORDERED: middle.block:
; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-ORDERED: scalar.ph:
; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
+; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-ORDERED: for.body:
; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[ADD]] = fadd float [[TMP10]], [[SUM_07]]
+; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-ORDERED-NEXT: [[ADD]] = fadd float [[TMP9]], [[SUM_07]]
; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]]
;
; CHECK-ORDERED-TF-LABEL: define float @fadd_strict
@@ -144,17 +142,16 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-ORDERED-TF: vector.body:
; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP11]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[WIDE_MASKED_LOAD]], <vscale x 8 x float> splat (float -0.000000e+00)
-; CHECK-ORDERED-TF-NEXT: [[TMP13]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP12]])
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP10]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[WIDE_MASKED_LOAD]], <vscale x 8 x float> splat (float -0.000000e+00)
+; CHECK-ORDERED-TF-NEXT: [[TMP12]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP11]])
; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]])
-; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = extractelement <vscale x 8 x i1> [[TMP14]], i32 0
-; CHECK-ORDERED-TF-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = extractelement <vscale x 8 x i1> [[TMP13]], i32 0
+; CHECK-ORDERED-TF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]]
; CHECK-ORDERED-TF: scalar.ph:
@@ -165,13 +162,13 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-ORDERED-TF-NEXT: [[ADD]] = fadd float [[TMP16]], [[SUM_07]]
+; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-ORDERED-TF-NEXT: [[ADD]] = fadd float [[TMP15]], [[SUM_07]]
; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK-ORDERED-TF: for.end:
-; CHECK-ORDERED-TF-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
; CHECK-ORDERED-TF-NEXT: ret float [[ADD_LCSSA]]
;
@@ -230,54 +227,53 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-UNORDERED: vector.body:
; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> splat (float -0.000000e+00), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> splat (float -0.000000e+00), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP9]]
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 16
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP12]]
-; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 24
-; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP15]]
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP7]], align 4
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP10]], align 4
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP13]], align 4
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP16]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP17]] = fadd <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT: [[TMP18]] = fadd <vscale x 8 x float> [[WIDE_LOAD4]], [[VEC_PHI1]]
-; CHECK-UNORDERED-NEXT: [[TMP19]] = fadd <vscale x 8 x float> [[WIDE_LOAD5]], [[VEC_PHI2]]
-; CHECK-UNORDERED-NEXT: [[TMP20]] = fadd <vscale x 8 x float> [[WIDE_LOAD6]], [[VEC_PHI3]]
+; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 8
+; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP8]]
+; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 16
+; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP11]]
+; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 24
+; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP14]]
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP6]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP9]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP12]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP15]], align 4
+; CHECK-UNORDERED-NEXT: [[TMP16]] = fadd <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-UNORDERED-NEXT: [[TMP17]] = fadd <vscale x 8 x float> [[WIDE_LOAD4]], [[VEC_PHI1]]
+; CHECK-UNORDERED-NEXT: [[TMP18]] = fadd <vscale x 8 x float> [[WIDE_LOAD5]], [[VEC_PHI2]]
+; CHECK-UNORDERED-NEXT: [[TMP19]] = fadd <vscale x 8 x float> [[WIDE_LOAD6]], [[VEC_PHI3]]
; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-UNORDERED-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd <vscale x 8 x float> [[TMP18]], [[TMP17]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX7:%.*]] = fadd <vscale x 8 x float> [[TMP19]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX8:%.*]] = fadd <vscale x 8 x float> [[TMP20]], [[BIN_RDX7]]
-; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX8]])
+; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd <vscale x 8 x float> [[TMP17]], [[TMP16]]
+; CHECK-UNORDERED-NEXT: [[BIN_RDX7:%.*]] = fadd <vscale x 8 x float> [[TMP18]], [[BIN_RDX]]
+; CHECK-UNORDERED-NEXT: [[BIN_RDX8:%.*]] = fadd <vscale x 8 x float> [[TMP19]], [[BIN_RDX7]]
+; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX8]])
; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-UNORDERED: scalar.ph:
; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
+; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-UNORDERED: for.body:
; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD]] = fadd float [[TMP23]], [[SUM_07]]
+; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-UNORDERED-NEXT: [[ADD]] = fadd float [[TMP22]], [[SUM_07]]
; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; CHECK-UNORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
; CHECK-UNORDERED-NEXT: ret float [[ADD_LCSSA]]
;
; CHECK-ORDERED-LABEL: define float @fadd_strict_unroll
@@ -297,47 +293,46 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-ORDERED: vector.body:
; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
-; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP9]]
-; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 16
-; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP12]]
-; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 24
-; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP15]]
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP7]], align 4
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP10]], align 4
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, ptr [[TMP13]], align 4
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, ptr [[TMP16]], align 4
-; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[WIDE_LOAD]])
-; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP17]], <vscale x 8 x float> [[WIDE_LOAD1]])
-; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP18]], <vscale x 8 x float> [[WIDE_LOAD2]])
-; CHECK-ORDERED-NEXT: [[TMP20]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP19]], <vscale x 8 x float> [[WIDE_LOAD3]])
+; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 8
+; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP8]]
+; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 16
+; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP11]]
+; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 24
+; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP14]]
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP6]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP9]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, ptr [[TMP12]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, ptr [[TMP15]], align 4
+; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[WIDE_LOAD]])
+; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP16]], <vscale x 8 x float> [[WIDE_LOAD1]])
+; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP17]], <vscale x 8 x float> [[WIDE_LOAD2]])
+; CHECK-ORDERED-NEXT: [[TMP19]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP18]], <vscale x 8 x float> [[WIDE_LOAD3]])
; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-ORDERED-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-ORDERED: middle.block:
; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-ORDERED: scalar.ph:
; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
+; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-ORDERED: for.body:
; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[ADD]] = fadd float [[TMP22]], [[SUM_07]]
+; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-ORDERED-NEXT: [[ADD]] = fadd float [[TMP21]], [[SUM_07]]
; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]]
;
; CHECK-ORDERED-TF-LABEL: define float @fadd_strict_unroll
@@ -378,47 +373,46 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT12:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT13:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT14:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 0
-; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 8
-; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP19]]
-; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul nuw i64 [[TMP21]], 16
-; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP22]]
-; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = mul nuw i64 [[TMP24]], 24
-; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP25]]
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP17]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP20]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP23]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP26]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[WIDE_MASKED_LOAD]], <vscale x 8 x float> splat (float -0.000000e+00)
-; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP27]])
-; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> [[WIDE_MASKED_LOAD9]], <vscale x 8 x float> splat (float -0.000000e+00)
-; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP28]], <vscale x 8 x float> [[TMP29]])
-; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> [[WIDE_MASKED_LOAD10]], <vscale x 8 x float> splat (float -0.000000e+00)
-; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP30]], <vscale x 8 x float> [[TMP31]])
-; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> [[WIDE_MASKED_LOAD11]], <vscale x 8 x float> splat (float -0.000000e+00)
-; CHECK-ORDERED-TF-NEXT: [[TMP34]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP32]], <vscale x 8 x float> [[TMP33]])
+; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 8
+; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP18]]
+; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 16
+; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP21]]
+; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = mul nuw i64 [[TMP23]], 24
+; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP24]]
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP16]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP19]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP22]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP25]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[WIDE_MASKED_LOAD]], <vscale x 8 x float> splat (float -0.000000e+00)
+; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP26]])
+; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> [[WIDE_MASKED_LOAD9]], <vscale x 8 x float> splat (float -0.000000e+00)
+; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP27]], <vscale x 8 x float> [[TMP28]])
+; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> [[WIDE_MASKED_LOAD10]], <vscale x 8 x float> splat (float -0.000000e+00)
+; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP29]], <vscale x 8 x float> [[TMP30]])
+; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> [[WIDE_MASKED_LOAD11]], <vscale x 8 x float> splat (float -0.000000e+00)
+; CHECK-ORDERED-TF-NEXT: [[TMP33]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP31]], <vscale x 8 x float> [[TMP32]])
; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
-; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = mul nuw i64 [[TMP35]], 8
-; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = add i64 [[INDEX]], [[TMP36]]
-; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = mul nuw i64 [[TMP38]], 16
-; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = add i64 [[INDEX]], [[TMP39]]
-; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = mul nuw i64 [[TMP41]], 24
-; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = add i64 [[INDEX]], [[TMP42]]
+; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = mul nuw i64 [[TMP34]], 8
+; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = add i64 [[INDEX]], [[TMP35]]
+; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], 16
+; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = add i64 [[INDEX]], [[TMP38]]
+; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = mul nuw i64 [[TMP40]], 24
+; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = add i64 [[INDEX]], [[TMP41]]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]])
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT12]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP37]], i64 [[TMP9]])
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP40]], i64 [[TMP9]])
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP43]], i64 [[TMP9]])
-; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = extractelement <vscale x 8 x i1> [[TMP44]], i32 0
-; CHECK-ORDERED-TF-NEXT: br i1 [[TMP45]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT12]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP36]], i64 [[TMP9]])
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT13]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP39]], i64 [[TMP9]])
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP42]], i64 [[TMP9]])
+; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = extractelement <vscale x 8 x i1> [[TMP43]], i32 0
+; CHECK-ORDERED-TF-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]]
; CHECK-ORDERED-TF: scalar.ph:
@@ -429,13 +423,13 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-ORDERED-TF-NEXT: [[ADD]] = fadd float [[TMP46]], [[SUM_07]]
+; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-ORDERED-TF-NEXT: [[ADD]] = fadd float [[TMP45]], [[SUM_07]]
; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK-ORDERED-TF: for.end:
-; CHECK-ORDERED-TF-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP34]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
; CHECK-ORDERED-TF-NEXT: ret float [[ADD_LCSSA]]
;
@@ -781,40 +775,38 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu
; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-UNORDERED: vector.body:
; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> splat (float -0.000000e+00), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> splat (float -0.000000e+00), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP10]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = fadd <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-UNORDERED-NEXT: [[TMP12]] = fadd <vscale x 4 x float> [[VEC_PHI]], [[TMP11]]
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP7]], align 4
+; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = fadd <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-UNORDERED-NEXT: [[TMP10]] = fadd <vscale x 4 x float> [[VEC_PHI]], [[TMP9]]
; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-UNORDERED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP12]])
+; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP10]])
; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-UNORDERED: scalar.ph:
; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ]
+; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ]
; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-UNORDERED: for.body:
; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-UNORDERED-NEXT: [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
; CHECK-UNORDERED-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD:%.*]] = fadd float [[TMP15]], [[TMP16]]
+; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; CHECK-UNORDERED-NEXT: [[ADD:%.*]] = fadd float [[TMP13]], [[TMP14]]
; CHECK-UNORDERED-NEXT: [[RDX]] = fadd float [[RES_014]], [[ADD]]
; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK-UNORDERED: for.end.loopexit:
-; CHECK-UNORDERED-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-UNORDERED-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
; CHECK-UNORDERED-NEXT: br label [[FOR_END]]
; CHECK-UNORDERED: for.end:
; CHECK-UNORDERED-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ]
@@ -842,39 +834,37 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu
; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-ORDERED: vector.body:
; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
-; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP10]], align 4
-; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = fadd <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-ORDERED-NEXT: [[TMP12]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP11]])
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP7]], align 4
+; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = fadd <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-ORDERED-NEXT: [[TMP10]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP9]])
; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
-; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-ORDERED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-ORDERED: middle.block:
; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-ORDERED: scalar.ph:
; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ]
+; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ]
; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-ORDERED: for.body:
; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-ORDERED-NEXT: [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
; CHECK-ORDERED-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
-; CHECK-ORDERED-NEXT: [[ADD:%.*]] = fadd float [[TMP14]], [[TMP15]]
+; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; CHECK-ORDERED-NEXT: [[ADD:%.*]] = fadd float [[TMP12]], [[TMP13]]
; CHECK-ORDERED-NEXT: [[RDX]] = fadd float [[RES_014]], [[ADD]]
; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK-ORDERED: for.end.loopexit:
-; CHECK-ORDERED-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
; CHECK-ORDERED-NEXT: br label [[FOR_END]]
; CHECK-ORDERED: for.end:
; CHECK-ORDERED-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ]
@@ -908,21 +898,19 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu
; CHECK-ORDERED-TF: vector.body:
; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
-; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 0
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = fadd <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
-; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[TMP15]], <vscale x 4 x float> splat (float -0.000000e+00)
-; CHECK-ORDERED-TF-NEXT: [[TMP17]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP16]])
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = fadd <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
+; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[TMP13]], <vscale x 4 x float> splat (float -0.000000e+00)
+; CHECK-ORDERED-TF-NEXT: [[TMP15]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP14]])
; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP10]])
-; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = extractelement <vscale x 4 x i1> [[TMP18]], i32 0
-; CHECK-ORDERED-TF-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = extractelement <vscale x 4 x i1> [[TMP16]], i32 0
+; CHECK-ORDERED-TF-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
; CHECK-ORDERED-TF-NEXT: br label [[FOR_END_LOOPEXIT:%.*]]
; CHECK-ORDERED-TF: scalar.ph:
@@ -933,16 +921,16 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu
; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-ORDERED-TF-NEXT: [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
-; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
-; CHECK-ORDERED-TF-NEXT: [[ADD:%.*]] = fadd float [[TMP20]], [[TMP21]]
+; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; CHECK-ORDERED-TF-NEXT: [[ADD:%.*]] = fadd float [[TMP18]], [[TMP19]]
; CHECK-ORDERED-TF-NEXT: [[RDX]] = fadd float [[RES_014]], [[ADD]]
; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK-ORDERED-TF: for.end.loopexit:
-; CHECK-ORDERED-TF-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
; CHECK-ORDERED-TF-NEXT: br label [[FOR_END]]
; CHECK-ORDERED-TF: for.end:
; CHECK-ORDERED-TF-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ]
@@ -1019,46 +1007,44 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-UNORDERED: vector.body:
; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> splat (float -0.000000e+00), float 1.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> splat (float -0.000000e+00), float 1.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP7]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]]
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[TMP9]], i32 0
-; CHECK-UNORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> poison)
-; CHECK-UNORDERED-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> splat (float 3.000000e+00)
-; CHECK-UNORDERED-NEXT: [[TMP11]] = fadd <vscale x 4 x float> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
+; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
+; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]]
+; CHECK-UNORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[TMP7]], <vscale x 4 x float> poison)
+; CHECK-UNORDERED-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP7]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> splat (float 3.000000e+00)
+; CHECK-UNORDERED-NEXT: [[TMP9]] = fadd <vscale x 4 x float> [[VEC_PHI]], [[PREDPHI]]
; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-UNORDERED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP11]])
+; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP9]])
; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-UNORDERED: scalar.ph:
; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 1.000000e+00, [[ENTRY]] ]
+; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 1.000000e+00, [[ENTRY]] ]
; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-UNORDERED: for.body:
; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; CHECK-UNORDERED-NEXT: [[RES:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[FADD:%.*]], [[FOR_INC]] ]
; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP14]], 0.000000e+00
+; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-UNORDERED-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP12]], 0.000000e+00
; CHECK-UNORDERED-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; CHECK-UNORDERED: if.then:
; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
; CHECK-UNORDERED-NEXT: br label [[FOR_INC]]
; CHECK-UNORDERED: for.inc:
-; CHECK-UNORDERED-NEXT: [[PHI:%.*]] = phi float [ [[TMP15]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[PHI:%.*]] = phi float [ [[TMP13]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ]
; CHECK-UNORDERED-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]]
; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-UNORDERED-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
; CHECK-UNORDERED-NEXT: ret float [[RDX]]
;
; CHECK-ORDERED-LABEL: define float @fadd_conditional
@@ -1078,45 +1064,43 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-ORDERED: vector.body:
; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
-; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP7]], align 4
-; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]]
-; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[TMP9]], i32 0
-; CHECK-ORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> poison)
-; CHECK-ORDERED-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> splat (float 3.000000e+00)
-; CHECK-ORDERED-NEXT: [[TMP11]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[PREDPHI]])
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
+; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
+; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]]
+; CHECK-ORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[TMP7]], <vscale x 4 x float> poison)
+; CHECK-ORDERED-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP7]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> splat (float 3.000000e+00)
+; CHECK-ORDERED-NEXT: [[TMP9]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[PREDPHI]])
; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-ORDERED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-ORDERED: middle.block:
; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-ORDERED: scalar.ph:
; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 1.000000e+00, [[ENTRY]] ]
+; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 1.000000e+00, [[ENTRY]] ]
; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-ORDERED: for.body:
; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; CHECK-ORDERED-NEXT: [[RES:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[FADD:%.*]], [[FOR_INC]] ]
; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP13]], 0.000000e+00
+; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-ORDERED-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP11]], 0.000000e+00
; CHECK-ORDERED-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; CHECK-ORDERED: if.then:
; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
; CHECK-ORDERED-NEXT: br label [[FOR_INC]]
; CHECK-ORDERED: for.inc:
-; CHECK-ORDERED-NEXT: [[PHI:%.*]] = phi float [ [[TMP14]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ]
+; CHECK-ORDERED-NEXT: [[PHI:%.*]] = phi float [ [[TMP12]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ]
; CHECK-ORDERED-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]]
; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
; CHECK-ORDERED-NEXT: ret float [[RDX]]
;
; CHECK-ORDERED-TF-LABEL: define float @fadd_conditional
@@ -1142,23 +1126,21 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
; CHECK-ORDERED-TF: vector.body:
; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
-; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_MASKED_LOAD]], zeroinitializer
-; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i1> zeroinitializer
-; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]]
-; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = getelementptr float, ptr [[TMP14]], i32 0
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP15]], i32 4, <vscale x 4 x i1> [[TMP13]], <vscale x 4 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x float> [[WIDE_MASKED_LOAD1]], <vscale x 4 x float> splat (float 3.000000e+00)
-; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[PREDPHI]], <vscale x 4 x float> splat (float -0.000000e+00)
-; CHECK-ORDERED-TF-NEXT: [[TMP17]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP16]])
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i1> zeroinitializer
+; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]]
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[TMP12]], <vscale x 4 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x float> [[WIDE_MASKED_LOAD1]], <vscale x 4 x float> splat (float 3.000000e+00)
+; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[PREDPHI]], <vscale x 4 x float> splat (float -0.000000e+00)
+; CHECK-ORDERED-TF-NEXT: [[TMP15]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP14]])
; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
-; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = extractelement <vscale x 4 x i1> [[TMP18]], i32 0
-; CHECK-ORDERED-TF-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = extractelement <vscale x 4 x i1> [[TMP16]], i32 0
+; CHECK-ORDERED-TF-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]]
; CHECK-ORDERED-TF: scalar.ph:
@@ -1169,21 +1151,21 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; CHECK-ORDERED-TF-NEXT: [[RES:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[FADD:%.*]], [[FOR_INC]] ]
; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
-; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-ORDERED-TF-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP20]], 0.000000e+00
+; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-ORDERED-TF-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP18]], 0.000000e+00
; CHECK-ORDERED-TF-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; CHECK-ORDERED-TF: if.then:
; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
; CHECK-ORDERED-TF-NEXT: br label [[FOR_INC]]
; CHECK-ORDERED-TF: for.inc:
-; CHECK-ORDERED-TF-NEXT: [[PHI:%.*]] = phi float [ [[TMP21]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT: [[PHI:%.*]] = phi float [ [[TMP19]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]]
; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK-ORDERED-TF: for.end:
-; CHECK-ORDERED-TF-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
; CHECK-ORDERED-TF-NEXT: ret float [[RDX]]
;
@@ -1257,40 +1239,38 @@ define float @fadd_multiple(ptr noalias nocapture %a, ptr noalias nocapture %b,
; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-UNORDERED: vector.body:
; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> splat (float -0.000000e+00), float -0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> splat (float -0.000000e+00), float -0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP7]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = fadd <vscale x 8 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP10]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP11]] = fadd <vscale x 8 x float> [[TMP8]], [[WIDE_LOAD1]]
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP6]], align 4
+; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = fadd <vscale x 8 x float> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP8]], align 4
+; CHECK-UNORDERED-NEXT: [[TMP9]] = fadd <vscale x 8 x float> [[TMP7]], [[WIDE_LOAD1]]
; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-UNORDERED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[TMP11]])
+; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[TMP9]])
; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-UNORDERED: scalar.ph:
; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ -0.000000e+00, [[ENTRY]] ]
+; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ -0.000000e+00, [[ENTRY]] ]
; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-UNORDERED: for.body:
; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-UNORDERED-NEXT: [[SUM:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP14]]
+; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-UNORDERED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP12]]
; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD3]] = fadd float [[ADD]], [[TMP15]]
+; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-UNORDERED-NEXT: [[ADD3]] = fadd float [[ADD]], [[TMP13]]
; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-UNORDERED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
; CHECK-UNORDERED-NEXT: ret float [[RDX]]
;
; CHECK-ORDERED-LABEL: define float @fadd_multiple
@@ -1396,71 +1376,69 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-UNORDERED: vector.body:
; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> splat (float -0.000000e+00), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[TMP31:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> splat (float -0.000000e+00), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ]
; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP9]]
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 16
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP12]]
-; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 24
-; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP15]]
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP7]], align 4
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP10]], align 4
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP13]], align 4
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP16]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
-; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 8
-; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP20]]
-; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 16
-; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP23]]
-; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = mul nuw i64 [[TMP25]], 24
-; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP26]]
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP18]], align 4
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 8 x float>, ptr [[TMP21]], align 4
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 8 x float>, ptr [[TMP24]], align 4
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 8 x float>, ptr [[TMP27]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP28]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI]])
-; CHECK-UNORDERED-NEXT: [[TMP29]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[WIDE_LOAD8]], <vscale x 8 x float> [[VEC_PHI1]])
-; CHECK-UNORDERED-NEXT: [[TMP30]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[WIDE_LOAD9]], <vscale x 8 x float> [[VEC_PHI2]])
-; CHECK-UNORDERED-NEXT: [[TMP31]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[WIDE_LOAD10]], <vscale x 8 x float> [[VEC_PHI3]])
+; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 8
+; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP8]]
+; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 16
+; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP11]]
+; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 24
+; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP14]]
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP6]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP9]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP12]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP15]], align 4
+; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
+; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 8
+; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP18]]
+; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 16
+; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP21]]
+; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = mul nuw i64 [[TMP23]], 24
+; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP24]]
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP16]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 8 x float>, ptr [[TMP19]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 8 x float>, ptr [[TMP22]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 8 x float>, ptr [[TMP25]], align 4
+; CHECK-UNORDERED-NEXT: [[TMP26]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED-NEXT: [[TMP27]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[WIDE_LOAD8]], <vscale x 8 x float> [[VEC_PHI1]])
+; CHECK-UNORDERED-NEXT: [[TMP28]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[WIDE_LOAD9]], <vscale x 8 x float> [[VEC_PHI2]])
+; CHECK-UNORDERED-NEXT: [[TMP29]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[WIDE_LOAD10]], <vscale x 8 x float> [[VEC_PHI3]])
; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-UNORDERED-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd <vscale x 8 x float> [[TMP29]], [[TMP28]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX11:%.*]] = fadd <vscale x 8 x float> [[TMP30]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX12:%.*]] = fadd <vscale x 8 x float> [[TMP31]], [[BIN_RDX11]]
-; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX12]])
+; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd <vscale x 8 x float> [[TMP27]], [[TMP26]]
+; CHECK-UNORDERED-NEXT: [[BIN_RDX11:%.*]] = fadd <vscale x 8 x float> [[TMP28]], [[BIN_RDX]]
+; CHECK-UNORDERED-NEXT: [[BIN_RDX12:%.*]] = fadd <vscale x 8 x float> [[TMP29]], [[BIN_RDX11]]
+; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX12]])
; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-UNORDERED: scalar.ph:
; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP33]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
+; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP31]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-UNORDERED: for.body:
; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP34:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP35:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP34]], float [[TMP35]], float [[SUM_07]])
+; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-UNORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP32]], float [[TMP33]], float [[SUM_07]])
; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
+; CHECK-UNORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
; CHECK-UNORDERED-NEXT: ret float [[MULADD_LCSSA]]
;
; CHECK-ORDERED-LABEL: define float @fmuladd_strict
@@ -1480,68 +1458,66 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-ORDERED: vector.body:
; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
-; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP9]]
-; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 16
-; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP12]]
-; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 24
-; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP15]]
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP7]], align 4
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP10]], align 4
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, ptr [[TMP13]], align 4
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, ptr [[TMP16]], align 4
-; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
-; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 8
-; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP20]]
-; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 16
-; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP23]]
-; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = mul nuw i64 [[TMP25]], 24
-; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP26]]
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP18]], align 4
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP21]], align 4
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP24]], align 4
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP27]], align 4
-; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
-; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
-; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
-; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
-; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP28]])
-; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP32]], <vscale x 8 x float> [[TMP29]])
-; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP33]], <vscale x 8 x float> [[TMP30]])
-; CHECK-ORDERED-NEXT: [[TMP35]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP34]], <vscale x 8 x float> [[TMP31]])
+; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 8
+; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP8]]
+; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 16
+; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP11]]
+; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 24
+; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP14]]
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP6]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP9]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, ptr [[TMP12]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, ptr [[TMP15]], align 4
+; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
+; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 8
+; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP18]]
+; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 16
+; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP21]]
+; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = mul nuw i64 [[TMP23]], 24
+; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP24]]
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP16]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP19]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP22]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP25]], align 4
+; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP26]])
+; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP30]], <vscale x 8 x float> [[TMP27]])
+; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP31]], <vscale x 8 x float> [[TMP28]])
+; CHECK-ORDERED-NEXT: [[TMP33]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP32]], <vscale x 8 x float> [[TMP29]])
; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-ORDERED-NEXT: br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-ORDERED: middle.block:
; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-ORDERED: scalar.ph:
; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP35]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
+; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP33]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-ORDERED: for.body:
; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP37:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP37]], float [[TMP38]], float [[SUM_07]])
+; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-ORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP35]], float [[TMP36]], float [[SUM_07]])
; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
; CHECK-ORDERED-NEXT: ret float [[MULADD_LCSSA]]
;
; CHECK-ORDERED-TF-LABEL: define float @fmuladd_strict
@@ -1582,66 +1558,64 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 0
-; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 8
-; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP19]]
-; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul nuw i64 [[TMP21]], 16
-; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP22]]
-; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = mul nuw i64 [[TMP24]], 24
-; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP25]]
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP17]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP20]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP23]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP26]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
-; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP27]], i32 0
-; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = mul nuw i64 [[TMP29]], 8
-; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, ptr [[TMP27]], i64 [[TMP30]]
-; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = mul nuw i64 [[TMP32]], 16
-; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP27]], i64 [[TMP33]]
-; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = mul nuw i64 [[TMP35]], 24
-; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[TMP27]], i64 [[TMP36]]
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP28]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP31]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP34]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP37]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = fmul <vscale x 8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]]
-; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = fmul <vscale x 8 x float> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]]
-; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = fmul <vscale x 8 x float> [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD14]]
-; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = fmul <vscale x 8 x float> [[WIDE_MASKED_LOAD11]], [[WIDE_MASKED_LOAD15]]
-; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[TMP38]], <vscale x 8 x float> splat (float -0.000000e+00)
-; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP42]])
-; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> [[TMP39]], <vscale x 8 x float> splat (float -0.000000e+00)
+; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 8
+; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP18]]
+; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 16
+; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP21]]
+; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = mul nuw i64 [[TMP23]], 24
+; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP24]]
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP16]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP19]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP22]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP25]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
+; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = mul nuw i64 [[TMP27]], 8
+; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP28]]
+; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = mul nuw i64 [[TMP30]], 16
+; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP31]]
+; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = mul nuw i64 [[TMP33]], 24
+; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP34]]
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP26]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP29]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP32]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP35]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = fmul <vscale x 8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]]
+; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = fmul <vscale x 8 x float> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]]
+; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = fmul <vscale x 8 x float> [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD14]]
+; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = fmul <vscale x 8 x float> [[WIDE_MASKED_LOAD11]], [[WIDE_MASKED_LOAD15]]
+; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[TMP36]], <vscale x 8 x float> splat (float -0.000000e+00)
+; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP40]])
+; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> [[TMP37]], <vscale x 8 x float> splat (float -0.000000e+00)
+; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP41]], <vscale x 8 x float> [[TMP42]])
+; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> [[TMP38]], <vscale x 8 x float> splat (float -0.000000e+00)
; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP43]], <vscale x 8 x float> [[TMP44]])
-; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> [[TMP40]], <vscale x 8 x float> splat (float -0.000000e+00)
-; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP45]], <vscale x 8 x float> [[TMP46]])
-; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> [[TMP41]], <vscale x 8 x float> splat (float -0.000000e+00)
-; CHECK-ORDERED-TF-NEXT: [[TMP49]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP47]], <vscale x 8 x float> [[TMP48]])
+; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> [[TMP39]], <vscale x 8 x float> splat (float -0.000000e+00)
+; CHECK-ORDERED-TF-NEXT: [[TMP47]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP45]], <vscale x 8 x float> [[TMP46]])
; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
-; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = mul nuw i64 [[TMP50]], 8
-; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = add i64 [[INDEX]], [[TMP51]]
-; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = mul nuw i64 [[TMP53]], 16
-; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = add i64 [[INDEX]], [[TMP54]]
-; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = mul nuw i64 [[TMP56]], 24
-; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = add i64 [[INDEX]], [[TMP57]]
+; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = mul nuw i64 [[TMP48]], 8
+; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = add i64 [[INDEX]], [[TMP49]]
+; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = mul nuw i64 [[TMP51]], 16
+; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = add i64 [[INDEX]], [[TMP52]]
+; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = mul nuw i64 [[TMP54]], 24
+; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = add i64 [[INDEX]], [[TMP55]]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]])
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP52]], i64 [[TMP9]])
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP55]], i64 [[TMP9]])
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP58]], i64 [[TMP9]])
-; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = extractelement <vscale x 8 x i1> [[TMP59]], i32 0
-; CHECK-ORDERED-TF-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP50]], i64 [[TMP9]])
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP53]], i64 [[TMP9]])
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP56]], i64 [[TMP9]])
+; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = extractelement <vscale x 8 x i1> [[TMP57]], i32 0
+; CHECK-ORDERED-TF-NEXT: br i1 [[TMP58]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]]
; CHECK-ORDERED-TF: scalar.ph:
@@ -1652,15 +1626,15 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
-; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-TF-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP61]], float [[TMP62]], float [[SUM_07]])
+; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-ORDERED-TF-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP59]], float [[TMP60]], float [[SUM_07]])
; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK-ORDERED-TF: for.end:
-; CHECK-ORDERED-TF-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP49]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP47]], [[MIDDLE_BLOCK]] ]
; CHECK-ORDERED-TF-NEXT: ret float [[MULADD_LCSSA]]
;
@@ -1724,71 +1698,69 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-UNORDERED: vector.body:
; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> splat (float -0.000000e+00), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[TMP31:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> splat (float -0.000000e+00), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
+; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ]
; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP9]]
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 16
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP12]]
-; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 24
-; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP15]]
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP7]], align 4
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP10]], align 4
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP13]], align 4
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP16]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
-; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 8
-; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP20]]
-; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 16
-; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP23]]
-; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = mul nuw i64 [[TMP25]], 24
-; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP26]]
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP18]], align 4
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 8 x float>, ptr [[TMP21]], align 4
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 8 x float>, ptr [[TMP24]], align 4
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 8 x float>, ptr [[TMP27]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP28]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI]])
-; CHECK-UNORDERED-NEXT: [[TMP29]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[WIDE_LOAD8]], <vscale x 8 x float> [[VEC_PHI1]])
-; CHECK-UNORDERED-NEXT: [[TMP30]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[WIDE_LOAD9]], <vscale x 8 x float> [[VEC_PHI2]])
-; CHECK-UNORDERED-NEXT: [[TMP31]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[WIDE_LOAD10]], <vscale x 8 x float> [[VEC_PHI3]])
+; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 8
+; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP8]]
+; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 16
+; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP11]]
+; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 24
+; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP14]]
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP6]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP9]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP12]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP15]], align 4
+; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
+; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 8
+; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP18]]
+; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 16
+; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP21]]
+; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = mul nuw i64 [[TMP23]], 24
+; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP24]]
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP16]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 8 x float>, ptr [[TMP19]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 8 x float>, ptr [[TMP22]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 8 x float>, ptr [[TMP25]], align 4
+; CHECK-UNORDERED-NEXT: [[TMP26]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED-NEXT: [[TMP27]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[WIDE_LOAD8]], <vscale x 8 x float> [[VEC_PHI1]])
+; CHECK-UNORDERED-NEXT: [[TMP28]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[WIDE_LOAD9]], <vscale x 8 x float> [[VEC_PHI2]])
+; CHECK-UNORDERED-NEXT: [[TMP29]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[WIDE_LOAD10]], <vscale x 8 x float> [[VEC_PHI3]])
; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-UNORDERED-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd nnan <vscale x 8 x float> [[TMP29]], [[TMP28]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX11:%.*]] = fadd nnan <vscale x 8 x float> [[TMP30]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX12:%.*]] = fadd nnan <vscale x 8 x float> [[TMP31]], [[BIN_RDX11]]
-; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX12]])
+; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd nnan <vscale x 8 x float> [[TMP27]], [[TMP26]]
+; CHECK-UNORDERED-NEXT: [[BIN_RDX11:%.*]] = fadd nnan <vscale x 8 x float> [[TMP28]], [[BIN_RDX]]
+; CHECK-UNORDERED-NEXT: [[BIN_RDX12:%.*]] = fadd nnan <vscale x 8 x float> [[TMP29]], [[BIN_RDX11]]
+; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX12]])
; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-UNORDERED: scalar.ph:
; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP33]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
+; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP31]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-UNORDERED: for.body:
; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP34:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP35:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP34]], float [[TMP35]], float [[SUM_07]])
+; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-UNORDERED-NEXT: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP32]], float [[TMP33]], float [[SUM_07]])
; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
+; CHECK-UNORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
; CHECK-UNORDERED-NEXT: ret float [[MULADD_LCSSA]]
;
; CHECK-ORDERED-LABEL: define float @fmuladd_strict_fmf
@@ -1808,68 +1780,66 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-ORDERED: vector.body:
; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 8
-; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP9]]
-; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 16
-; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP12]]
-; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 24
-; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP15]]
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP7]], align 4
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP10]], align 4
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, ptr [[TMP13]], align 4
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, ptr [[TMP16]], align 4
-; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
-; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 8
-; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP20]]
-; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 16
-; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP23]]
-; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = mul nuw i64 [[TMP25]], 24
-; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP26]]
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP18]], align 4
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP21]], align 4
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP24]], align 4
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP27]], align 4
-; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
-; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
-; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
-; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
-; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP28]])
-; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP32]], <vscale x 8 x float> [[TMP29]])
-; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP33]], <vscale x 8 x float> [[TMP30]])
-; CHECK-ORDERED-NEXT: [[TMP35]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP34]], <vscale x 8 x float> [[TMP31]])
+; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 8
+; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP8]]
+; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 16
+; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP11]]
+; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 24
+; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP14]]
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, ptr [[TMP6]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, ptr [[TMP9]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, ptr [[TMP12]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, ptr [[TMP15]], align 4
+; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
+; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 8
+; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP18]]
+; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 16
+; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP21]]
+; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = mul nuw i64 [[TMP23]], 24
+; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP24]]
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, ptr [[TMP16]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, ptr [[TMP19]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, ptr [[TMP22]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, ptr [[TMP25]], align 4
+; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP26]])
+; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP30]], <vscale x 8 x float> [[TMP27]])
+; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP31]], <vscale x 8 x float> [[TMP28]])
+; CHECK-ORDERED-NEXT: [[TMP33]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP32]], <vscale x 8 x float> [[TMP29]])
; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-ORDERED-NEXT: br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK-ORDERED: middle.block:
; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-ORDERED: scalar.ph:
; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP35]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
+; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP33]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-ORDERED: for.body:
; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP37:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP37]], float [[TMP38]], float [[SUM_07]])
+; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-ORDERED-NEXT: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP35]], float [[TMP36]], float [[SUM_07]])
; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
; CHECK-ORDERED-NEXT: ret float [[MULADD_LCSSA]]
;
; CHECK-ORDERED-TF-LABEL: define float @fmuladd_strict_fmf
@@ -1910,66 +1880,64 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY3]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[VECTOR_BODY]] ]
+; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 0
-; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 8
-; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP19]]
-; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = mul nuw i64 [[TMP21]], 16
-; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP22]]
-; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = mul nuw i64 [[TMP24]], 24
-; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP25]]
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP17]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP20]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP23]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP26]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
-; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP27]], i32 0
-; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = mul nuw i64 [[TMP29]], 8
-; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, ptr [[TMP27]], i64 [[TMP30]]
-; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = mul nuw i64 [[TMP32]], 16
-; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP27]], i64 [[TMP33]]
-; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = mul nuw i64 [[TMP35]], 24
-; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[TMP27]], i64 [[TMP36]]
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP28]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP31]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP34]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP37]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]]
-; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]]
-; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD14]]
-; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_MASKED_LOAD11]], [[WIDE_MASKED_LOAD15]]
-; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = select nnan <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[TMP38]], <vscale x 8 x float> splat (float -0.000000e+00)
-; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP42]])
-; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = select nnan <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> [[TMP39]], <vscale x 8 x float> splat (float -0.000000e+00)
+; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 8
+; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP18]]
+; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 16
+; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP21]]
+; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = mul nuw i64 [[TMP23]], 24
+; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP24]]
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP16]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP19]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP22]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP25]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
+; CHECK-ORDERED-TF-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP28:%.*]] = mul nuw i64 [[TMP27]], 8
+; CHECK-ORDERED-TF-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP28]]
+; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = mul nuw i64 [[TMP30]], 16
+; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP31]]
+; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = mul nuw i64 [[TMP33]], 24
+; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP26]], i64 [[TMP34]]
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP26]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP29]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP32]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP35]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison)
+; CHECK-ORDERED-TF-NEXT: [[TMP36:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]]
+; CHECK-ORDERED-TF-NEXT: [[TMP37:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]]
+; CHECK-ORDERED-TF-NEXT: [[TMP38:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD14]]
+; CHECK-ORDERED-TF-NEXT: [[TMP39:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_MASKED_LOAD11]], [[WIDE_MASKED_LOAD15]]
+; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = select nnan <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[TMP36]], <vscale x 8 x float> splat (float -0.000000e+00)
+; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP40]])
+; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = select nnan <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> [[TMP37]], <vscale x 8 x float> splat (float -0.000000e+00)
+; CHECK-ORDERED-TF-NEXT: [[TMP43:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP41]], <vscale x 8 x float> [[TMP42]])
+; CHECK-ORDERED-TF-NEXT: [[TMP44:%.*]] = select nnan <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> [[TMP38]], <vscale x 8 x float> splat (float -0.000000e+00)
; CHECK-ORDERED-TF-NEXT: [[TMP45:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP43]], <vscale x 8 x float> [[TMP44]])
-; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = select nnan <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> [[TMP40]], <vscale x 8 x float> splat (float -0.000000e+00)
-; CHECK-ORDERED-TF-NEXT: [[TMP47:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP45]], <vscale x 8 x float> [[TMP46]])
-; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = select nnan <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> [[TMP41]], <vscale x 8 x float> splat (float -0.000000e+00)
-; CHECK-ORDERED-TF-NEXT: [[TMP49]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP47]], <vscale x 8 x float> [[TMP48]])
+; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = select nnan <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> [[TMP39]], <vscale x 8 x float> splat (float -0.000000e+00)
+; CHECK-ORDERED-TF-NEXT: [[TMP47]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP45]], <vscale x 8 x float> [[TMP46]])
; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
-; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = mul nuw i64 [[TMP50]], 8
-; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = add i64 [[INDEX]], [[TMP51]]
-; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = mul nuw i64 [[TMP53]], 16
-; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = add i64 [[INDEX]], [[TMP54]]
-; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = mul nuw i64 [[TMP56]], 24
-; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = add i64 [[INDEX]], [[TMP57]]
+; CHECK-ORDERED-TF-NEXT: [[TMP48:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = mul nuw i64 [[TMP48]], 8
+; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = add i64 [[INDEX]], [[TMP49]]
+; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = mul nuw i64 [[TMP51]], 16
+; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = add i64 [[INDEX]], [[TMP52]]
+; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = mul nuw i64 [[TMP54]], 24
+; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = add i64 [[INDEX]], [[TMP55]]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]])
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP52]], i64 [[TMP9]])
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP55]], i64 [[TMP9]])
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP58]], i64 [[TMP9]])
-; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = extractelement <vscale x 8 x i1> [[TMP59]], i32 0
-; CHECK-ORDERED-TF-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP50]], i64 [[TMP9]])
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP53]], i64 [[TMP9]])
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP56]], i64 [[TMP9]])
+; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = extractelement <vscale x 8 x i1> [[TMP57]], i32 0
+; CHECK-ORDERED-TF-NEXT: br i1 [[TMP58]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK-ORDERED-TF: middle.block:
; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]]
; CHECK-ORDERED-TF: scalar.ph:
@@ -1980,15 +1948,15 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
-; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-TF-NEXT: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP61]], float [[TMP62]], float [[SUM_07]])
+; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-ORDERED-TF-NEXT: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP59]], float [[TMP60]], float [[SUM_07]])
; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK-ORDERED-TF: for.end:
-; CHECK-ORDERED-TF-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP49]], [[MIDDLE_BLOCK]] ]
+; CHECK-ORDERED-TF-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP47]], [[MIDDLE_BLOCK]] ]
; CHECK-ORDERED-TF-NEXT: ret float [[MULADD_LCSSA]]
;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
index 3bb20e2..eaf85694 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
@@ -30,11 +30,9 @@ define i64 @same_exit_block_pre_inc_use1() #1 {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP12]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], [[TMP5]]
; CHECK-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP16]])
@@ -114,8 +112,7 @@ define i64 @same_exit_block_pre_inc_use4() {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 3, i64 4>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp uge <2 x i64> [[VEC_IND]], [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 2
; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP4]])
@@ -126,14 +123,14 @@ define i64 @same_exit_block_pre_inc_use4() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP4]], i1 true)
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP8]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -189,8 +186,7 @@ define i64 @loop_contains_safe_call() #1 {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP5:%.*]] = fcmp fast oge <4 x float> [[TMP3]], splat (float 3.000000e+00)
; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4
@@ -201,14 +197,14 @@ define i64 @loop_contains_safe_call() #1 {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP6]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true)
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP9]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -276,8 +272,7 @@ define i64 @loop_contains_safe_div() #1 {
; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX1:%.*]] = add i64 3, [[INDEX2]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[OFFSET_IDX1]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP13:%.*]] = udiv <vscale x 4 x i32> [[WIDE_LOAD]], splat (i32 20000)
; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <vscale x 4 x i32> [[TMP13]], splat (i32 1)
; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX2]], [[TMP5]]
@@ -356,12 +351,10 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD]], splat (i32 1)
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P2]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
@@ -371,14 +364,14 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[WIDE_LOAD2]], i32 3
; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -457,11 +450,9 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll
index 61ef3ce..070f658 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll
@@ -14,40 +14,88 @@ define i64 @same_exit_block_pre_inc_use1() #0 {
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 510, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 64
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 510, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 510, [[N_MOD_VF]]
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16
+; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 64
; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add i64 3, [[N_VEC]]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
-; CHECK-NEXT: [[TMP32:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD8]]
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 16
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP27:%.*]] = mul nuw i64 [[TMP11]], 32
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP27]]
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 48
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP15]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP13]], align 1
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP28]], align 1
+; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 16
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP29]], i64 [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP22:%.*]] = mul nuw i64 [[TMP21]], 32
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP29]], i64 [[TMP22]]
+; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP25:%.*]] = mul nuw i64 [[TMP24]], 48
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP29]], i64 [[TMP25]]
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 16 x i8>, ptr [[TMP29]], align 1
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 16 x i8>, ptr [[TMP20]], align 1
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 16 x i8>, ptr [[TMP23]], align 1
+; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 16 x i8>, ptr [[TMP26]], align 1
+; CHECK-NEXT: [[TMP32:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD5]]
+; CHECK-NEXT: [[TMP30:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-NEXT: [[TMP59:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD8]]
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], [[TMP5]]
-; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP32]])
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]]
-; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
-; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP34:%.*]] = or <vscale x 16 x i1> [[TMP32]], [[TMP30]]
+; CHECK-NEXT: [[TMP37:%.*]] = or <vscale x 16 x i1> [[TMP34]], [[TMP31]]
+; CHECK-NEXT: [[TMP33:%.*]] = or <vscale x 16 x i1> [[TMP37]], [[TMP59]]
+; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP33]])
+; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]]
+; CHECK-NEXT: [[TMP36:%.*]] = or i1 [[TMP12]], [[TMP35]]
+; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP12]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC:%.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 510, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
; CHECK: vector.early.exit:
+; CHECK-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP40:%.*]] = mul nuw i64 [[TMP39]], 16
+; CHECK-NEXT: [[TMP41:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP59]], i1 true)
+; CHECK-NEXT: [[TMP42:%.*]] = mul i64 [[TMP40]], 3
+; CHECK-NEXT: [[TMP43:%.*]] = add i64 [[TMP42]], [[TMP41]]
+; CHECK-NEXT: [[TMP44:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP31]], i1 true)
+; CHECK-NEXT: [[TMP45:%.*]] = mul i64 [[TMP40]], 2
+; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[TMP45]], [[TMP44]]
+; CHECK-NEXT: [[TMP47:%.*]] = icmp ne i64 [[TMP44]], [[TMP40]]
+; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], i64 [[TMP46]], i64 [[TMP43]]
+; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP30]], i1 true)
+; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP40]], 1
+; CHECK-NEXT: [[TMP51:%.*]] = add i64 [[TMP50]], [[TMP49]]
+; CHECK-NEXT: [[TMP52:%.*]] = icmp ne i64 [[TMP49]], [[TMP40]]
+; CHECK-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], i64 [[TMP51]], i64 [[TMP48]]
; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
-; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX1]], [[TMP61]]
+; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP40]], 0
+; CHECK-NEXT: [[TMP56:%.*]] = add i64 [[TMP55]], [[TMP61]]
+; CHECK-NEXT: [[TMP57:%.*]] = icmp ne i64 [[TMP61]], [[TMP40]]
+; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP57]], i64 [[TMP56]], i64 [[TMP53]]
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX1]], [[TMP58]]
; CHECK-NEXT: [[TMP17:%.*]] = add i64 3, [[TMP16]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
index 77b768e..51efbe9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
@@ -29,11 +29,10 @@ define void @cost_store_i8(ptr %dst) #0 {
; DEFAULT: vector.body:
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
-; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
; DEFAULT-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 16
; DEFAULT-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP9]], i64 [[TMP23]]
-; DEFAULT-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP10]], align 1
+; DEFAULT-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP9]], align 1
; DEFAULT-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP24]], align 1
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
; DEFAULT-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -59,8 +58,7 @@ define void @cost_store_i8(ptr %dst) #0 {
; DEFAULT: vec.epilog.vector.body:
; DEFAULT-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; DEFAULT-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX5]]
-; DEFAULT-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i32 0
-; DEFAULT-NEXT: store <vscale x 8 x i8> zeroinitializer, ptr [[TMP20]], align 1
+; DEFAULT-NEXT: store <vscale x 8 x i8> zeroinitializer, ptr [[TMP19]], align 1
; DEFAULT-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX5]], [[TMP17]]
; DEFAULT-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
; DEFAULT-NEXT: br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -104,8 +102,7 @@ define void @cost_store_i8(ptr %dst) #0 {
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; PRED-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
-; PRED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
-; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> zeroinitializer, ptr [[TMP14]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> zeroinitializer, ptr [[TMP13]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP11]])
; PRED-NEXT: [[TMP15:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
@@ -169,9 +166,8 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 {
; DEFAULT-NEXT: [[TMP8:%.*]] = and <16 x i8> [[TMP5]], [[TMP7]]
; DEFAULT-NEXT: [[TMP9:%.*]] = and <16 x i8> [[TMP5]], [[TMP7]]
; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
-; DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
; DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP10]], i32 16
-; DEFAULT-NEXT: store <16 x i8> [[TMP8]], ptr [[TMP12]], align 1, !alias.scope [[META8:![0-9]+]], !noalias [[META5]]
+; DEFAULT-NEXT: store <16 x i8> [[TMP8]], ptr [[TMP10]], align 1, !alias.scope [[META8:![0-9]+]], !noalias [[META5]]
; DEFAULT-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP13]], align 1, !alias.scope [[META8]], !noalias [[META5]]
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; DEFAULT-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 992
@@ -194,8 +190,7 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 {
; DEFAULT-NEXT: [[TMP18:%.*]] = trunc <8 x i64> [[BROADCAST_SPLAT8]] to <8 x i8>
; DEFAULT-NEXT: [[TMP14:%.*]] = and <8 x i8> [[TMP18]], [[TMP15]]
; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX5]]
-; DEFAULT-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
-; DEFAULT-NEXT: store <8 x i8> [[TMP14]], ptr [[TMP27]], align 1, !alias.scope [[META8]], !noalias [[META5]]
+; DEFAULT-NEXT: store <8 x i8> [[TMP14]], ptr [[TMP26]], align 1, !alias.scope [[META8]], !noalias [[META5]]
; DEFAULT-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 8
; DEFAULT-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT8]], 1000
; DEFAULT-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
@@ -252,8 +247,7 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 {
; PRED-NEXT: [[TMP8:%.*]] = trunc <vscale x 2 x i64> [[BROADCAST_SPLAT3]] to <vscale x 2 x i8>
; PRED-NEXT: [[TMP9:%.*]] = and <vscale x 2 x i8> [[TMP8]], [[TMP11]]
; PRED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
-; PRED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
-; PRED-NEXT: call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP9]], ptr [[TMP6]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META7:![0-9]+]], !noalias [[META4]]
+; PRED-NEXT: call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP9]], ptr [[TMP5]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META7:![0-9]+]], !noalias [[META4]]
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1000)
; PRED-NEXT: [[TMP12:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/streaming-compatible-sve-no-maximize-bandwidth.ll b/llvm/test/Transforms/LoopVectorize/AArch64/streaming-compatible-sve-no-maximize-bandwidth.ll
index 045f1c4..1213d97 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/streaming-compatible-sve-no-maximize-bandwidth.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/streaming-compatible-sve-no-maximize-bandwidth.ll
@@ -27,20 +27,17 @@ define i32 @foo(i32 noundef %n, i32 noundef %lag, i32 noundef %shift) vscale_ran
; SC_SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
; SC_SVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; SC_SVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i16], ptr @a, i64 0, i64 [[INDEX]]
-; SC_SVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[TMP2]], i32 0
-; SC_SVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP3]], align 2
+; SC_SVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP2]], align 2
; SC_SVE-NEXT: [[TMP4:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
; SC_SVE-NEXT: [[TMP5:%.*]] = ashr <4 x i32> [[TMP4]], [[VEC_IND]]
; SC_SVE-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDEX]], [[TMP0]]
; SC_SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i16], ptr @b, i64 0, i64 [[TMP6]]
-; SC_SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP7]], i32 0
-; SC_SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, ptr [[TMP8]], align 2
+; SC_SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, ptr [[TMP7]], align 2
; SC_SVE-NEXT: [[TMP9:%.*]] = sext <4 x i16> [[WIDE_LOAD1]] to <4 x i32>
; SC_SVE-NEXT: [[TMP10:%.*]] = shl <4 x i32> [[TMP9]], [[VEC_IND]]
; SC_SVE-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[TMP10]], [[TMP5]]
; SC_SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i16], ptr @c, i64 0, i64 [[INDEX]]
-; SC_SVE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[TMP12]], i32 0
-; SC_SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i16>, ptr [[TMP13]], align 2
+; SC_SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i16>, ptr [[TMP12]], align 2
; SC_SVE-NEXT: [[TMP14:%.*]] = sext <4 x i16> [[WIDE_LOAD2]] to <4 x i32>
; SC_SVE-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP11]], [[TMP14]]
; SC_SVE-NEXT: [[TMP16:%.*]] = shl <4 x i32> [[TMP15]], [[BROADCAST_SPLAT]]
@@ -101,20 +98,17 @@ define i32 @foo(i32 noundef %n, i32 noundef %lag, i32 noundef %shift) vscale_ran
; NO_SC_SVE-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
; NO_SC_SVE-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO_SC_SVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i16], ptr @a, i64 0, i64 [[INDEX]]
-; NO_SC_SVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[TMP2]], i32 0
-; NO_SC_SVE-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
+; NO_SC_SVE-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2
; NO_SC_SVE-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
; NO_SC_SVE-NEXT: [[TMP5:%.*]] = ashr <8 x i32> [[TMP4]], [[VEC_IND]]
; NO_SC_SVE-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDEX]], [[TMP0]]
; NO_SC_SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i16], ptr @b, i64 0, i64 [[TMP6]]
-; NO_SC_SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP7]], i32 0
-; NO_SC_SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i16>, ptr [[TMP8]], align 2
+; NO_SC_SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2
; NO_SC_SVE-NEXT: [[TMP9:%.*]] = sext <8 x i16> [[WIDE_LOAD1]] to <8 x i32>
; NO_SC_SVE-NEXT: [[TMP10:%.*]] = shl <8 x i32> [[TMP9]], [[VEC_IND]]
; NO_SC_SVE-NEXT: [[TMP11:%.*]] = mul nsw <8 x i32> [[TMP10]], [[TMP5]]
; NO_SC_SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i16], ptr @c, i64 0, i64 [[INDEX]]
-; NO_SC_SVE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[TMP12]], i32 0
-; NO_SC_SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP13]], align 2
+; NO_SC_SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP12]], align 2
; NO_SC_SVE-NEXT: [[TMP14:%.*]] = sext <8 x i16> [[WIDE_LOAD2]] to <8 x i32>
; NO_SC_SVE-NEXT: [[TMP15:%.*]] = add nsw <8 x i32> [[TMP11]], [[TMP14]]
; NO_SC_SVE-NEXT: [[TMP16:%.*]] = shl <8 x i32> [[TMP15]], [[BROADCAST_SPLAT]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
index 8399614..f4eebca 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
@@ -970,7 +970,7 @@ loop:
%red.next = fadd double %for, %red
%for.next = sitofp i32 %iv to double
%iv.next = add nsw i32 %iv, 1
- %ec = icmp eq i32 %iv.next, 1024
+ %ec = icmp eq i32 %iv.next, 1025
br i1 %ec, label %exit, label %loop, !llvm.loop !13
exit:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
index 0fddadd..3d81541 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
@@ -27,11 +27,10 @@ define i64 @int_reduction_and(ptr noalias nocapture %a, i64 %N) {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 1, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -1, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 2
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP14]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP10]], align 8
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP15]], align 8
; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP17]] = and i64 [[TMP16]], [[VEC_PHI]]
@@ -58,8 +57,7 @@ define i64 @int_reduction_and(ptr noalias nocapture %a, i64 %N) {
; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX7]]
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[TMP24]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i64>, ptr [[TMP25]], align 8
+; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i64>, ptr [[TMP24]], align 8
; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> [[WIDE_LOAD9]])
; CHECK-NEXT: [[TMP27]] = and i64 [[TMP26]], [[VEC_PHI8]]
; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], 2
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
index de8fcb0..af9c39e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
@@ -27,11 +27,10 @@ define i64 @int_reduction_add(ptr %a, i64 %N) {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ insertelement (<vscale x 2 x i64> zeroinitializer, i64 5, i32 0), [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 2
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP14]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP10]], align 8
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP15]], align 8
; CHECK-NEXT: [[TMP16]] = add <vscale x 2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP17]] = add <vscale x 2 x i64> [[WIDE_LOAD3]], [[VEC_PHI2]]
@@ -58,8 +57,7 @@ define i64 @int_reduction_add(ptr %a, i64 %N) {
; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi <2 x i64> [ [[TMP22]], [[VEC_EPILOG_PH]] ], [ [[TMP26:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX7]]
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[TMP24]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i64>, ptr [[TMP25]], align 8
+; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i64>, ptr [[TMP24]], align 8
; CHECK-NEXT: [[TMP26]] = add <2 x i64> [[WIDE_LOAD9]], [[VEC_PHI8]]
; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], 2
; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
index 83f2b2a..fca29cd 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
@@ -26,11 +26,10 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0xFFFFFFFFE0000000, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP16]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP17]], align 4
; CHECK-NEXT: [[TMP18:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP19]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[TMP18]], <vscale x 4 x float> [[WIDE_LOAD2]])
@@ -54,8 +53,7 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) {
; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi float [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP24:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX6]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x float>, ptr [[TMP23]], align 4
+; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x float>, ptr [[TMP22]], align 4
; CHECK-NEXT: [[TMP24]] = call float @llvm.vector.reduce.fadd.v2f32(float [[VEC_PHI7]], <2 x float> [[WIDE_LOAD8]])
; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 2
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
index 5209374..0f407cd 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
@@ -41,11 +41,10 @@ define void @main_vf_vscale_x_16(ptr %A) #0 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 16
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 [[TMP18]]
-; CHECK-NEXT: store <vscale x 16 x i8> splat (i8 1), ptr [[TMP16]], align 1
+; CHECK-NEXT: store <vscale x 16 x i8> splat (i8 1), ptr [[TMP14]], align 1
; CHECK-NEXT: store <vscale x 16 x i8> splat (i8 1), ptr [[TMP19]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -71,8 +70,7 @@ define void @main_vf_vscale_x_16(ptr %A) #0 {
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX5]]
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP28]], i32 0
-; CHECK-NEXT: store <vscale x 8 x i8> splat (i8 1), ptr [[TMP29]], align 1
+; CHECK-NEXT: store <vscale x 8 x i8> splat (i8 1), ptr [[TMP28]], align 1
; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX5]], [[TMP26]]
; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -103,11 +101,10 @@ define void @main_vf_vscale_x_16(ptr %A) #0 {
; CHECK-VF8: vector.body:
; CHECK-VF8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-VF8-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-VF8-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0
; CHECK-VF8-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF8-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 16
; CHECK-VF8-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 [[TMP16]]
-; CHECK-VF8-NEXT: store <vscale x 16 x i8> splat (i8 1), ptr [[TMP14]], align 1
+; CHECK-VF8-NEXT: store <vscale x 16 x i8> splat (i8 1), ptr [[TMP12]], align 1
; CHECK-VF8-NEXT: store <vscale x 16 x i8> splat (i8 1), ptr [[TMP17]], align 1
; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-VF8-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -125,8 +122,7 @@ define void @main_vf_vscale_x_16(ptr %A) #0 {
; CHECK-VF8: vec.epilog.vector.body:
; CHECK-VF8-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-VF8-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX1]]
-; CHECK-VF8-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP20]], i32 0
-; CHECK-VF8-NEXT: store <8 x i8> splat (i8 1), ptr [[TMP21]], align 1
+; CHECK-VF8-NEXT: store <8 x i8> splat (i8 1), ptr [[TMP20]], align 1
; CHECK-VF8-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8
; CHECK-VF8-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 1024
; CHECK-VF8-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -152,6 +148,104 @@ exit:
ret void
}
+define void @main_vf_vscale_x_2_no_epi_iteration(ptr %A) #0 vscale_range(8, 8) {
+; CHECK-LABEL: @main_vf_vscale_x_2_no_epi_iteration(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i64 [[TMP8]]
+; CHECK-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP6]], align 1
+; CHECK-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP9]], align 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+;
+; CHECK-VF8-LABEL: @main_vf_vscale_x_2_no_epi_iteration(
+; CHECK-VF8-NEXT: iter.check:
+; CHECK-VF8-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-VF8: vector.main.loop.iter.check:
+; CHECK-VF8-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; CHECK-VF8-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-VF8-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-VF8: vector.ph:
+; CHECK-VF8-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
+; CHECK-VF8-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-VF8-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-VF8-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
+; CHECK-VF8-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-VF8: vector.body:
+; CHECK-VF8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-VF8-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
+; CHECK-VF8-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i64 [[TMP8]]
+; CHECK-VF8-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP6]], align 1
+; CHECK-VF8-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP9]], align 1
+; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-VF8-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF8-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-VF8: middle.block:
+; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK-VF8: vec.epilog.iter.check:
+; CHECK-VF8-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
+; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-VF8: vec.epilog.ph:
+; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-VF8-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK-VF8: vec.epilog.vector.body:
+; CHECK-VF8-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-VF8-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX1]]
+; CHECK-VF8-NEXT: store <8 x i64> splat (i64 1), ptr [[TMP11]], align 1
+; CHECK-VF8-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8
+; CHECK-VF8-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 1024
+; CHECK-VF8-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-VF8: vec.epilog.middle.block:
+; CHECK-VF8-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-VF8: vec.epilog.scalar.ph:
+; CHECK-VF8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-VF8-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-VF8: for.body:
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i64, ptr %A, i64 %iv
+ store i64 1, ptr %arrayidx, align 1
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp ne i64 %iv.next, 1024
+ br i1 %exitcond, label %for.body, label %exit
+
+exit:
+ ret void
+}
; DEBUG: LV: Checking a loop in 'main_vf_vscale_x_2'
; DEBUG: Create Skeleton for epilogue vectorized loop (first pass)
@@ -167,110 +261,114 @@ exit:
; fixed-width VF=8 for the epilogue if the vectors are known to be
; sufficiently wide. This information can be deduced from vscale_range or
; VScaleForTuning (set by mcpu/mtune).
-define void @main_vf_vscale_x_2(ptr %A) #0 vscale_range(8, 8) {
+define void @main_vf_vscale_x_2(ptr %A, i64 %n) #0 vscale_range(8, 8) {
; CHECK-LABEL: @main_vf_vscale_x_2(
; CHECK-NEXT: iter.check:
-; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; CHECK: vector.main.loop.iter.check:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 2
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i64 [[TMP16]]
-; CHECK-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP14]], align 1
+; CHECK-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP12]], align 1
; CHECK-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP17]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
+; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
-; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[SCALAR_PH]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX1]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP20]], i32 0
-; CHECK-NEXT: store <8 x i64> splat (i64 1), ptr [[TMP21]], align 1
-; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 1024
-; CHECK-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX4]]
+; CHECK-NEXT: store <8 x i64> splat (i64 1), ptr [[TMP19]], align 1
+; CHECK-NEXT: [[INDEX_NEXT5]] = add nuw i64 [[INDEX4]], 8
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT5]], [[N_VEC3]]
+; CHECK-NEXT: br i1 [[TMP20]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC3]]
+; CHECK-NEXT: br i1 [[CMP_N6]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[SCALAR_PH]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
; CHECK: for.body:
;
; CHECK-VF8-LABEL: @main_vf_vscale_x_2(
; CHECK-VF8-NEXT: iter.check:
-; CHECK-VF8-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-VF8-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N:%.*]], 8
+; CHECK-VF8-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; CHECK-VF8: vector.main.loop.iter.check:
; CHECK-VF8-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF8-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; CHECK-VF8-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-VF8-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
; CHECK-VF8-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-VF8: vector.ph:
; CHECK-VF8-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF8-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
-; CHECK-VF8-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-VF8-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-VF8-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-VF8-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-VF8-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF8-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
; CHECK-VF8-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-VF8: vector.body:
; CHECK-VF8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-VF8-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-VF8-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
; CHECK-VF8-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF8-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 2
; CHECK-VF8-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i64 [[TMP16]]
-; CHECK-VF8-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP14]], align 1
+; CHECK-VF8-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP12]], align 1
; CHECK-VF8-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP17]], align 1
; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-VF8-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF8-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-VF8-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-VF8: middle.block:
-; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK-VF8: vec.epilog.iter.check:
-; CHECK-VF8-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
+; CHECK-VF8-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK-VF8: vec.epilog.ph:
; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-VF8-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[N]], 8
+; CHECK-VF8-NEXT: [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]]
; CHECK-VF8-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK-VF8: vec.epilog.vector.body:
; CHECK-VF8-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-VF8-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX1]]
-; CHECK-VF8-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP20]], i32 0
-; CHECK-VF8-NEXT: store <8 x i64> splat (i64 1), ptr [[TMP21]], align 1
+; CHECK-VF8-NEXT: store <8 x i64> splat (i64 1), ptr [[TMP20]], align 1
; CHECK-VF8-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8
-; CHECK-VF8-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 1024
-; CHECK-VF8-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-VF8-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC3]]
+; CHECK-VF8-NEXT: br i1 [[TMP19]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK-VF8: vec.epilog.middle.block:
-; CHECK-VF8-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-VF8-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC3]]
+; CHECK-VF8-NEXT: br i1 [[CMP_N6]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK-VF8: vec.epilog.scalar.ph:
-; CHECK-VF8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-VF8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; CHECK-VF8-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-VF8: for.body:
;
@@ -282,7 +380,7 @@ for.body:
%arrayidx = getelementptr inbounds i64, ptr %A, i64 %iv
store i64 1, ptr %arrayidx, align 1
%iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp ne i64 %iv.next, 1024
+ %exitcond = icmp ne i64 %iv.next, %n
br i1 %exitcond, label %for.body, label %exit
exit:
@@ -314,15 +412,14 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0
; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 16
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP14]], i64 [[TMP18]]
-; CHECK-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP16]], align 1
+; CHECK-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP14]], align 1
; CHECK-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP19]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -346,11 +443,10 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 {
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX7]]
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[TMP28]], i32 0
-; CHECK-NEXT: store <vscale x 8 x i8> zeroinitializer, ptr [[TMP29]], align 1
+; CHECK-NEXT: store <vscale x 8 x i8> zeroinitializer, ptr [[TMP28]], align 1
; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX7]], [[TMP26]]
; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]]
-; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 10000, [[N_VEC3]]
; CHECK-NEXT: br i1 [[CMP_N6]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -379,15 +475,14 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 {
; CHECK-VF8: vector.body:
; CHECK-VF8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-VF8-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[INDEX]]
-; CHECK-VF8-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0
; CHECK-VF8-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF8-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 16
; CHECK-VF8-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP16]]
-; CHECK-VF8-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP14]], align 1
+; CHECK-VF8-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP12]], align 1
; CHECK-VF8-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP17]], align 1
; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-VF8-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF8-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-VF8-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK-VF8: middle.block:
; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, [[N_VEC]]
; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -403,11 +498,10 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 {
; CHECK-VF8: vec.epilog.vector.body:
; CHECK-VF8-NEXT: [[INDEX3:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-VF8-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX3]]
-; CHECK-VF8-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
-; CHECK-VF8-NEXT: store <8 x i8> zeroinitializer, ptr [[TMP21]], align 1
+; CHECK-VF8-NEXT: store <8 x i8> zeroinitializer, ptr [[TMP20]], align 1
; CHECK-VF8-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX3]], 8
; CHECK-VF8-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 10000
-; CHECK-VF8-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-VF8-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-VF8: vec.epilog.middle.block:
; CHECK-VF8-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK-VF8: vec.epilog.scalar.ph:
@@ -432,4 +526,307 @@ exit:
ret void
}
+; Loop with a vscale-based trip count of vscale x 1033.
+define void @trip_count_vscale(ptr noalias %a, ptr noalias %b) vscale_range(1, 16) #0 {
+; CHECK-LABEL: @trip_count_vscale(
+; CHECK-NEXT: iter.check:
+; CHECK-NEXT: [[V:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[N:%.*]] = mul nuw nsw i64 [[V]], 1033
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
+; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK: vector.main.loop.iter.check:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP3]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP5]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 8
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i64 [[TMP11]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[TMP16]]
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x float>, ptr [[TMP17]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD3]]
+; CHECK-NEXT: [[TMP19:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD4]]
+; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 4
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[TMP21]]
+; CHECK-NEXT: store <vscale x 4 x float> [[TMP18]], ptr [[TMP13]], align 4
+; CHECK-NEXT: store <vscale x 4 x float> [[TMP19]], ptr [[TMP22]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK: vec.epilog.iter.check:
+; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP25:%.*]] = mul nuw i64 [[TMP24]], 2
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP25]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK: vec.epilog.ph:
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP27:%.*]] = mul nuw i64 [[TMP26]], 2
+; CHECK-NEXT: [[N_MOD_VF5:%.*]] = urem i64 [[N]], [[TMP27]]
+; CHECK-NEXT: [[N_VEC6:%.*]] = sub i64 [[N]], [[N_MOD_VF5]]
+; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP29:%.*]] = mul nuw i64 [[TMP28]], 2
+; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK: vec.epilog.vector.body:
+; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDEX7]]
+; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 2 x float>, ptr [[TMP30]], align 4
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDEX7]]
+; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 2 x float>, ptr [[TMP32]], align 4
+; CHECK-NEXT: [[TMP34:%.*]] = fmul <vscale x 2 x float> [[WIDE_LOAD8]], [[WIDE_LOAD9]]
+; CHECK-NEXT: store <vscale x 2 x float> [[TMP34]], ptr [[TMP32]], align 4
+; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], [[TMP29]]
+; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC6]]
+; CHECK-NEXT: br i1 [[TMP35]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK: vec.epilog.middle.block:
+; CHECK-NEXT: [[CMP_N11:%.*]] = icmp eq i64 [[N]], [[N_VEC6]]
+; CHECK-NEXT: br i1 [[CMP_N11]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK: vec.epilog.scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+;
+; CHECK-VF8-LABEL: @trip_count_vscale(
+; CHECK-VF8-NEXT: entry:
+; CHECK-VF8-NEXT: [[V:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[N:%.*]] = mul nuw nsw i64 [[V]], 1033
+; CHECK-VF8-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
+; CHECK-VF8-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-VF8: vector.ph:
+; CHECK-VF8-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
+; CHECK-VF8-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-VF8-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF8-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
+; CHECK-VF8-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-VF8: vector.body:
+; CHECK-VF8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-VF8-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
+; CHECK-VF8-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i64 [[TMP9]]
+; CHECK-VF8-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
+; CHECK-VF8-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP10]], align 4
+; CHECK-VF8-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[B:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 4
+; CHECK-VF8-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP14]]
+; CHECK-VF8-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP11]], align 4
+; CHECK-VF8-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP15]], align 4
+; CHECK-VF8-NEXT: [[TMP16:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-VF8-NEXT: [[TMP17:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
+; CHECK-VF8-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 4
+; CHECK-VF8-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP19]]
+; CHECK-VF8-NEXT: store <vscale x 4 x float> [[TMP16]], ptr [[TMP11]], align 4
+; CHECK-VF8-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[TMP20]], align 4
+; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-VF8-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF8-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-VF8: middle.block:
+; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-VF8: scalar.ph:
+; CHECK-VF8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-VF8-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-VF8: for.body:
+;
+entry:
+ %v = tail call i64 @llvm.vscale.i64()
+ %n = mul nuw nsw i64 %v, 1033
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds nuw float, ptr %a, i64 %iv
+ %l.a = load float, ptr %arrayidx, align 4
+ %arrayidx3 = getelementptr inbounds nuw float, ptr %b, i64 %iv
+ %l.b = load float, ptr %arrayidx3, align 4
+ %mul4 = fmul float %l.a, %l.b
+ store float %mul4, ptr %arrayidx3, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+; Loop with a vscale-based trip count of vscale x 1024.
+; TODO: No epilogue vectorization should remain when choosing VF = vscale x 4.
+define void @trip_count_vscale_no_epilogue_iterations(ptr noalias %a, ptr noalias %b) vscale_range(1, 16) #0 {
+; CHECK-LABEL: @trip_count_vscale_no_epilogue_iterations(
+; CHECK-NEXT: iter.check:
+; CHECK-NEXT: [[V:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[N:%.*]] = mul nuw nsw i64 [[V]], 1024
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
+; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK: vector.main.loop.iter.check:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP3]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP5]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 8
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i64 [[TMP11]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[TMP16]]
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x float>, ptr [[TMP17]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD3]]
+; CHECK-NEXT: [[TMP19:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD4]]
+; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 4
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[TMP21]]
+; CHECK-NEXT: store <vscale x 4 x float> [[TMP18]], ptr [[TMP13]], align 4
+; CHECK-NEXT: store <vscale x 4 x float> [[TMP19]], ptr [[TMP22]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK: vec.epilog.iter.check:
+; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP25:%.*]] = mul nuw i64 [[TMP24]], 2
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP25]]
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK: vec.epilog.ph:
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP27:%.*]] = mul nuw i64 [[TMP26]], 2
+; CHECK-NEXT: [[N_MOD_VF5:%.*]] = urem i64 [[N]], [[TMP27]]
+; CHECK-NEXT: [[N_VEC6:%.*]] = sub i64 [[N]], [[N_MOD_VF5]]
+; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP29:%.*]] = mul nuw i64 [[TMP28]], 2
+; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK: vec.epilog.vector.body:
+; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDEX7]]
+; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 2 x float>, ptr [[TMP30]], align 4
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDEX7]]
+; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 2 x float>, ptr [[TMP32]], align 4
+; CHECK-NEXT: [[TMP34:%.*]] = fmul <vscale x 2 x float> [[WIDE_LOAD8]], [[WIDE_LOAD9]]
+; CHECK-NEXT: store <vscale x 2 x float> [[TMP34]], ptr [[TMP32]], align 4
+; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], [[TMP29]]
+; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC6]]
+; CHECK-NEXT: br i1 [[TMP35]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK: vec.epilog.middle.block:
+; CHECK-NEXT: [[CMP_N11:%.*]] = icmp eq i64 [[N]], [[N_VEC6]]
+; CHECK-NEXT: br i1 [[CMP_N11]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK: vec.epilog.scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+;
+; CHECK-VF8-LABEL: @trip_count_vscale_no_epilogue_iterations(
+; CHECK-VF8-NEXT: entry:
+; CHECK-VF8-NEXT: [[V:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[N:%.*]] = mul nuw nsw i64 [[V]], 1024
+; CHECK-VF8-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8
+; CHECK-VF8-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-VF8: vector.ph:
+; CHECK-VF8-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
+; CHECK-VF8-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-VF8-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF8-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
+; CHECK-VF8-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-VF8: vector.body:
+; CHECK-VF8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-VF8-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
+; CHECK-VF8-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i64 [[TMP9]]
+; CHECK-VF8-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
+; CHECK-VF8-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP10]], align 4
+; CHECK-VF8-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[B:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 4
+; CHECK-VF8-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP14]]
+; CHECK-VF8-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP11]], align 4
+; CHECK-VF8-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP15]], align 4
+; CHECK-VF8-NEXT: [[TMP16:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-VF8-NEXT: [[TMP17:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
+; CHECK-VF8-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 4
+; CHECK-VF8-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP19]]
+; CHECK-VF8-NEXT: store <vscale x 4 x float> [[TMP16]], ptr [[TMP11]], align 4
+; CHECK-VF8-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[TMP20]], align 4
+; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-VF8-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF8-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-VF8: middle.block:
+; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-VF8: scalar.ph:
+; CHECK-VF8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-VF8-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-VF8: for.body:
+;
+entry:
+ %v = tail call i64 @llvm.vscale.i64()
+ %n = mul nuw nsw i64 %v, 1024
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds nuw float, ptr %a, i64 %iv
+ %l.a = load float, ptr %arrayidx, align 4
+ %arrayidx3 = getelementptr inbounds nuw float, ptr %b, i64 %iv
+ %l.b = load float, ptr %arrayidx3, align 4
+ %mul4 = fmul float %l.a, %l.b
+ store float %mul4, ptr %arrayidx3, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %n
+ br i1 %ec, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
attributes #0 = { "target-features"="+sve" }
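Two properties drive the expected output in this file. First, an attribute
like vscale_range(8, 8) pins vscale to 8, so scalable registers are exactly
8 x 128 = 1024 bits and a fixed-width <8 x i64> epilogue (512 bits) is
always legal, which is how the checks arrive at a fixed epilogue VF of 8.
Second, for a trip count of the form vscale x C and a main-loop VF of
vscale x 8, the remainder is vscale x (C mod 8): C = 1033 leaves vscale x 1
epilogue iterations, while C = 1024 leaves none, as the TODO notes. A
minimal sketch of that remainder arithmetic (illustrative only; the
function name is hypothetical):

declare i64 @llvm.vscale.i64()

define i64 @epilogue_remainder_sketch(i64 %c) {
entry:
  ; N = vscale * C and VF = vscale * 8, so N mod VF = vscale * (C mod 8);
  ; C = 1024 leaves no epilogue iterations, C = 1033 leaves vscale * 1.
  %vscale = call i64 @llvm.vscale.i64()
  %n = mul nuw nsw i64 %vscale, %c
  %vf = mul nuw i64 %vscale, 8
  %rem = urem i64 %n, %vf
  ret i64 %rem
}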
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll
index 51e2492..20bc0af 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll
@@ -28,21 +28,18 @@ define void @sve_add(ptr %dst, ptr %a, ptr %b, i64 %n) {
; CHECK-CA510: [[VECTOR_BODY]]:
; CHECK-CA510-NEXT: [[TMP2:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-CA510-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[TMP2]]
-; CHECK-CA510-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 0
; CHECK-CA510-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 4
-; CHECK-CA510-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-CA510-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
; CHECK-CA510-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
; CHECK-CA510-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[TMP2]]
-; CHECK-CA510-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 0
; CHECK-CA510-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 4
-; CHECK-CA510-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
+; CHECK-CA510-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
; CHECK-CA510-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP8]], align 4
; CHECK-CA510-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD]]
; CHECK-CA510-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD5]]
; CHECK-CA510-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[TMP2]]
-; CHECK-CA510-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i32 0
; CHECK-CA510-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i32 4
-; CHECK-CA510-NEXT: store <4 x float> [[TMP9]], ptr [[TMP12]], align 4
+; CHECK-CA510-NEXT: store <4 x float> [[TMP9]], ptr [[TMP11]], align 4
; CHECK-CA510-NEXT: store <4 x float> [[TMP10]], ptr [[TMP13]], align 4
; CHECK-CA510-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP2]], 8
; CHECK-CA510-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -95,21 +92,18 @@ define void @sve_add(ptr %dst, ptr %a, ptr %b, i64 %n) {
; CHECK-CA520: [[VECTOR_BODY]]:
; CHECK-CA520-NEXT: [[TMP2:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-CA520-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[TMP2]]
-; CHECK-CA520-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 0
; CHECK-CA520-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 4
-; CHECK-CA520-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-CA520-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
; CHECK-CA520-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
; CHECK-CA520-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[TMP2]]
-; CHECK-CA520-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 0
; CHECK-CA520-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 4
-; CHECK-CA520-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
+; CHECK-CA520-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
; CHECK-CA520-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP8]], align 4
; CHECK-CA520-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD]]
; CHECK-CA520-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD5]]
; CHECK-CA520-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[TMP2]]
-; CHECK-CA520-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i32 0
; CHECK-CA520-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i32 4
-; CHECK-CA520-NEXT: store <4 x float> [[TMP9]], ptr [[TMP12]], align 4
+; CHECK-CA520-NEXT: store <4 x float> [[TMP9]], ptr [[TMP11]], align 4
; CHECK-CA520-NEXT: store <4 x float> [[TMP10]], ptr [[TMP13]], align 4
; CHECK-CA520-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP2]], 8
; CHECK-CA520-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll
index 0322f74..24f93f0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll
@@ -37,20 +37,18 @@ define void @fneg(ptr nocapture noundef writeonly %d, ptr nocapture noundef read
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds half, ptr [[S]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds half, ptr [[TMP11]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 8
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds half, ptr [[TMP11]], i64 [[TMP14]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP12]], align 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP11]], align 2
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x half>, ptr [[TMP15]], align 2
; CHECK-NEXT: [[TMP16:%.*]] = fneg <vscale x 8 x half> [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP17:%.*]] = fneg <vscale x 8 x half> [[WIDE_LOAD3]]
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds half, ptr [[D]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds half, ptr [[TMP18]], i32 0
; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 8
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds half, ptr [[TMP18]], i64 [[TMP21]]
-; CHECK-NEXT: store <vscale x 8 x half> [[TMP16]], ptr [[TMP19]], align 2
+; CHECK-NEXT: store <vscale x 8 x half> [[TMP16]], ptr [[TMP18]], align 2
; CHECK-NEXT: store <vscale x 8 x half> [[TMP17]], ptr [[TMP22]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
index 70042ca..fefb5af 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
@@ -37,11 +37,10 @@ define void @induction_i7(ptr %dst) #0 {
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP23:%.*]] = zext <vscale x 2 x i7> [[TMP19]] to <vscale x 2 x i64>
; CHECK-NEXT: [[TMP24:%.*]] = zext <vscale x 2 x i7> [[TMP20]] to <vscale x 2 x i64>
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i32 0
; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP27:%.*]] = mul nuw i64 [[TMP26]], 2
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i64 [[TMP27]]
-; CHECK-NEXT: store <vscale x 2 x i64> [[TMP23]], ptr [[TMP25]], align 8
+; CHECK-NEXT: store <vscale x 2 x i64> [[TMP23]], ptr [[TMP21]], align 8
; CHECK-NEXT: store <vscale x 2 x i64> [[TMP24]], ptr [[TMP28]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i7> [[STEP_ADD]], [[DOTSPLAT]]
@@ -100,11 +99,10 @@ define void @induction_i3_zext(ptr %dst) #0 {
; CHECK-NEXT: [[TMP19:%.*]] = zext <vscale x 2 x i3> [[VEC_IND]] to <vscale x 2 x i64>
; CHECK-NEXT: [[TMP20:%.*]] = zext <vscale x 2 x i3> [[STEP_ADD]] to <vscale x 2 x i64>
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i32 0
; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP25:%.*]] = mul nuw i64 [[TMP24]], 2
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i64 [[TMP25]]
-; CHECK-NEXT: store <vscale x 2 x i64> [[TMP19]], ptr [[TMP23]], align 8
+; CHECK-NEXT: store <vscale x 2 x i64> [[TMP19]], ptr [[TMP21]], align 8
; CHECK-NEXT: store <vscale x 2 x i64> [[TMP20]], ptr [[TMP26]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i3> [[STEP_ADD]], [[DOTSPLAT]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
index f8c635b..f0675a4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
@@ -21,8 +21,7 @@ define void @inv_store_i16(ptr noalias %dst, ptr noalias readonly %src, i64 %N)
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP8]], align 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP7]], align 2
; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i32 [[TMP9]], 4
; CHECK-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1
@@ -73,8 +72,7 @@ define void @cond_inv_store_i32(ptr noalias %dst, ptr noalias readonly %src, i64
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP9]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
index cac526f..2b4aad1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
@@ -30,11 +30,10 @@ define ptr @test(ptr %start.1, ptr %start.2, ptr %end) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[START_2]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i64, ptr [[TMP30]], i32 0
; CHECK-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP34:%.*]] = mul nuw i64 [[TMP33]], 2
; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i64, ptr [[TMP30]], i64 [[TMP34]]
-; CHECK-NEXT: store <vscale x 2 x i64> zeroinitializer, ptr [[TMP32]], align 8
+; CHECK-NEXT: store <vscale x 2 x i64> zeroinitializer, ptr [[TMP30]], align 8
; CHECK-NEXT: store <vscale x 2 x i64> zeroinitializer, ptr [[TMP35]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll
index 246beb2..993c048 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll
@@ -39,18 +39,16 @@ define void @multiple_exits_unique_exit_block(ptr %A, ptr %B, i32 %N) #0 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 4
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP23]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP21]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP24]], align 4
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 0
; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP29:%.*]] = mul nuw i64 [[TMP28]], 4
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i64 [[TMP29]]
-; CHECK-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD]], ptr [[TMP27]], align 4
+; CHECK-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD]], ptr [[TMP25]], align 4
; CHECK-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD3]], ptr [[TMP30]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP12]]
; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -113,18 +111,16 @@ define i32 @multiple_exits_multiple_exit_blocks(ptr %A, ptr %B, i32 %N) #0 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 4
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP23]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP21]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP24]], align 4
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 0
; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP29:%.*]] = mul nuw i64 [[TMP28]], 4
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i64 [[TMP29]]
-; CHECK-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD]], ptr [[TMP27]], align 4
+; CHECK-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD]], ptr [[TMP25]], align 4
; CHECK-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD3]], ptr [[TMP30]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP12]]
; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll
index b6f723e..893ebef 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll
@@ -51,34 +51,30 @@ define void @min_trip_count_due_to_runtime_checks_1(ptr %dst.1, ptr %dst.2, ptr
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i64, ptr [[SRC_1]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i64, ptr [[SRC_2]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i64, ptr [[TMP23]], i32 0
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i64, ptr [[SRC_1]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i64, ptr [[SRC_2]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP29:%.*]] = mul nuw i64 [[TMP28]], 2
-; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i64, ptr [[TMP23]], i64 [[TMP29]]
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i64, ptr [[TMP27]], i64 [[TMP29]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP27]], align 8
; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <vscale x 2 x i64>, ptr [[TMP30]], align 8
-; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i64, ptr [[TMP25]], i32 0
; CHECK-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP33:%.*]] = mul nuw i64 [[TMP32]], 2
-; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i64, ptr [[TMP25]], i64 [[TMP33]]
+; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i64, ptr [[TMP31]], i64 [[TMP33]]
; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <vscale x 2 x i64>, ptr [[TMP31]], align 8
; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <vscale x 2 x i64>, ptr [[TMP34]], align 8
; CHECK-NEXT: [[TMP35:%.*]] = add <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD13]]
; CHECK-NEXT: [[TMP36:%.*]] = add <vscale x 2 x i64> [[WIDE_LOAD12]], [[WIDE_LOAD14]]
-; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i64, ptr [[DST_1]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP39:%.*]] = getelementptr i64, ptr [[DST_2]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP41:%.*]] = getelementptr i64, ptr [[TMP37]], i32 0
+; CHECK-NEXT: [[TMP41:%.*]] = getelementptr i64, ptr [[DST_1]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP45:%.*]] = getelementptr i64, ptr [[DST_2]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP43:%.*]] = mul nuw i64 [[TMP42]], 2
-; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i64, ptr [[TMP37]], i64 [[TMP43]]
+; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i64, ptr [[TMP41]], i64 [[TMP43]]
; CHECK-NEXT: store <vscale x 2 x i64> [[TMP35]], ptr [[TMP41]], align 8
; CHECK-NEXT: store <vscale x 2 x i64> [[TMP36]], ptr [[TMP44]], align 8
-; CHECK-NEXT: [[TMP45:%.*]] = getelementptr i64, ptr [[TMP39]], i32 0
; CHECK-NEXT: [[TMP46:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP47:%.*]] = mul nuw i64 [[TMP46]], 2
-; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i64, ptr [[TMP39]], i64 [[TMP47]]
+; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i64, ptr [[TMP45]], i64 [[TMP47]]
; CHECK-NEXT: store <vscale x 2 x i64> [[TMP35]], ptr [[TMP45]], align 8
; CHECK-NEXT: store <vscale x 2 x i64> [[TMP36]], ptr [[TMP48]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP50]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
index eb8f218..1cda568 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
@@ -67,8 +67,7 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll
index 90b4901..fb0447b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll
@@ -22,14 +22,12 @@ define void @trip1025_i64(ptr noalias nocapture noundef %dst, ptr noalias nocapt
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP8]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
; CHECK-NEXT: [[TMP10:%.*]] = shl nsw <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], splat (i64 1)
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP12]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP11]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
; CHECK-NEXT: [[TMP13:%.*]] = add nsw <vscale x 2 x i64> [[WIDE_MASKED_LOAD1]], [[TMP10]]
-; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP13]], ptr [[TMP12]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP13]], ptr [[TMP11]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
; CHECK-NEXT: [[TMP14:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll
index 2de24b0..fc86e3a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll
@@ -32,8 +32,7 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP14]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP13]], <vscale x 4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
@@ -86,8 +85,7 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 {
; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-IN-LOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
; CHECK-IN-LOOP-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]]
-; CHECK-IN-LOOP-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
-; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> zeroinitializer
; CHECK-IN-LOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
; CHECK-IN-LOOP-NEXT: [[TMP15]] = add i32 [[TMP14]], [[VEC_PHI]]
@@ -158,8 +156,7 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[PTR:%.*]], i64 [[INDEX1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
; CHECK-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> splat (float -0.000000e+00)
; CHECK-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP13]])
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
@@ -211,8 +208,7 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 {
; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-IN-LOOP-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
; CHECK-IN-LOOP-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[PTR:%.*]], i64 [[INDEX1]]
-; CHECK-IN-LOOP-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[TMP11]], i32 0
-; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
+; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> splat (float -0.000000e+00)
; CHECK-IN-LOOP-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP13]])
; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
@@ -281,13 +277,11 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 {
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> zeroinitializer, i32 7, i32 0), [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 5)
; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP14]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP16]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD1]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[TMP17]], <vscale x 4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[TMP20]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x i32> [[VEC_PHI]]
@@ -348,13 +342,11 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 {
; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-IN-LOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 7, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-IN-LOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[INDEX]]
-; CHECK-IN-LOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
-; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 5)
; CHECK-IN-LOOP-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer
; CHECK-IN-LOOP-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-IN-LOOP-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP14]], i32 0
-; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP16]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison)
+; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison)
; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD1]], <vscale x 4 x i32> zeroinitializer
; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> [[TMP17]])
; CHECK-IN-LOOP-NEXT: [[TMP19]] = xor i32 [[TMP18]], [[VEC_PHI]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
index ea9cd3f..4ec7d4d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
@@ -46,7 +46,6 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT12:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT13:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX6]]
-; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i32, ptr [[TMP47]], i32 0
; CHECK-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP53:%.*]] = mul nuw i64 [[TMP52]], 4
; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP53]]
@@ -56,7 +55,7 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP59:%.*]] = mul nuw i64 [[TMP58]], 12
; CHECK-NEXT: [[TMP60:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP59]]
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP51]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP47]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP54]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK7]])
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP57]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK8]])
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP60]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]])
@@ -138,7 +137,6 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias %
; CHECK-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT15:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK9:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY5]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT16:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i32, ptr [[COND_PTR:%.*]], i64 [[INDEX6]]
-; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i32, ptr [[TMP47]], i32 0
; CHECK-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP53:%.*]] = mul nuw i64 [[TMP52]], 4
; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP53]]
@@ -148,7 +146,7 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias %
; CHECK-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP59:%.*]] = mul nuw i64 [[TMP58]], 12
; CHECK-NEXT: [[TMP60:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP59]]
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP51]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP47]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP54]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP57]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP60]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 4 x i32> poison)
@@ -161,7 +159,6 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias %
; CHECK-NEXT: [[TMP71:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 4 x i1> [[TMP63]], <vscale x 4 x i1> zeroinitializer
; CHECK-NEXT: [[TMP72:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 4 x i1> [[TMP64]], <vscale x 4 x i1> zeroinitializer
; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX6]]
-; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, ptr [[TMP65]], i32 0
; CHECK-NEXT: [[TMP74:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP75:%.*]] = mul nuw i64 [[TMP74]], 4
; CHECK-NEXT: [[TMP76:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP75]]
@@ -171,7 +168,7 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias %
; CHECK-NEXT: [[TMP80:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP81:%.*]] = mul nuw i64 [[TMP80]], 12
; CHECK-NEXT: [[TMP82:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP81]]
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP73]], i32 4, <vscale x 4 x i1> [[TMP69]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP65]], i32 4, <vscale x 4 x i1> [[TMP69]])
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP76]], i32 4, <vscale x 4 x i1> [[TMP70]])
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP79]], i32 4, <vscale x 4 x i1> [[TMP71]])
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP82]], i32 4, <vscale x 4 x i1> [[TMP72]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
index f6f8895..672523e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
@@ -31,8 +31,7 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP13:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
@@ -78,8 +77,7 @@ define void @simple_memset_v4i32(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP4]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], 4
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX1]], i64 [[TMP2]])
; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
@@ -130,11 +128,9 @@ define void @simple_memcpy(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[SRC:%.*]], i64 [[INDEX1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[INDEX1]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
@@ -253,8 +249,7 @@ define void @simple_gather_scatter(ptr noalias %dst, ptr noalias %src, ptr noali
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[IND:%.*]], i64 [[INDEX1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[SRC:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[DST:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
@@ -317,8 +312,7 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP11]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
@@ -376,15 +370,13 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[INDEX1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> zeroinitializer
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX1]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[PREDPHI]], ptr [[TMP17]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[PREDPHI]], ptr [[TMP16]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
@@ -450,8 +442,7 @@ define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n)
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
@@ -506,12 +497,10 @@ define void @simple_fdiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[SRC:%.*]], i64 [[INDEX1]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 [[INDEX1]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr float, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[TMP12]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
; CHECK-NEXT: [[TMP15:%.*]] = fdiv <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD2]]
-; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP15]], ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP15]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP16:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
@@ -568,13 +557,11 @@ define void @simple_idiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[SRC:%.*]], i64 [[INDEX1]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[INDEX1]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD2]], <vscale x 4 x i32> splat (i32 1)
; CHECK-NEXT: [[TMP16:%.*]] = udiv <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[TMP15]]
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP16]], ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP16]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
@@ -622,8 +609,7 @@ define void @simple_memset_trip1024(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT: store <vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP8]], align 4
+; CHECK-NEXT: store <vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP7]], align 4
; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], [[TMP5]]
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vscale-based-trip-counts.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vscale-based-trip-counts.ll
index e2c7469..3e9f6fa 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vscale-based-trip-counts.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vscale-based-trip-counts.ll
@@ -24,25 +24,23 @@ define void @vscale_mul_4(ptr noalias noundef readonly captures(none) %a, ptr no
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP18]], 4
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[TMP11]]
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP26]], align 4
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP16]]
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP17]], align 4
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x float>, ptr [[TMP27]], align 4
; CHECK-NEXT: [[TMP19:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD3]]
; CHECK-NEXT: [[TMP28:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD4]]
; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 4
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP21]]
-; CHECK-NEXT: store <vscale x 4 x float> [[TMP19]], ptr [[TMP17]], align 4
+; CHECK-NEXT: store <vscale x 4 x float> [[TMP19]], ptr [[TMP12]], align 4
; CHECK-NEXT: store <vscale x 4 x float> [[TMP28]], ptr [[TMP22]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -102,25 +100,22 @@ define void @vscale_mul_8(ptr noalias noundef readonly captures(none) %a, ptr n
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[MUL1]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 8
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[TMP10]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[A]], align 4
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP11]], align 4
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 4
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[TMP15]]
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[B]], align 4
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP16]], align 4
; CHECK-NEXT: [[TMP17:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[TMP18:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i32 0
; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 4
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[TMP21]]
-; CHECK-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[TMP19]], align 4
+; CHECK-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[B]], align 4
; CHECK-NEXT: store <vscale x 4 x float> [[TMP18]], ptr [[TMP22]], align 4
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[MUL1]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY:.*]]
@@ -180,25 +175,23 @@ define void @vscale_mul_12(ptr noalias noundef readonly captures(none) %a, ptr n
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 [[TMP10]]
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP7]], align 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP11]], align 4
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 4
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP15]]
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x float>, ptr [[TMP16]], align 4
; CHECK-NEXT: [[TMP18:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
; CHECK-NEXT: [[TMP25:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD4]]
; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 4
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP20]]
-; CHECK-NEXT: store <vscale x 4 x float> [[TMP18]], ptr [[TMP13]], align 4
+; CHECK-NEXT: store <vscale x 4 x float> [[TMP18]], ptr [[TMP12]], align 4
; CHECK-NEXT: store <vscale x 4 x float> [[TMP25]], ptr [[TMP21]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -265,25 +258,23 @@ define void @vscale_mul_31(ptr noalias noundef readonly captures(none) %a, ptr n
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 [[TMP10]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP7]], align 4
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP11]], align 4
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 4
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP15]]
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP16]], align 4
; CHECK-NEXT: [[TMP17:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[TMP18:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 4
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP20]]
-; CHECK-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[TMP13]], align 4
+; CHECK-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[TMP12]], align 4
; CHECK-NEXT: store <vscale x 4 x float> [[TMP18]], ptr [[TMP21]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -350,25 +341,23 @@ define void @vscale_mul_64(ptr noalias noundef readonly captures(none) %a, ptr n
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 [[TMP10]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP7]], align 4
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP11]], align 4
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 4
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP15]]
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP16]], align 4
; CHECK-NEXT: [[TMP17:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[TMP18:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 4
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw float, ptr [[TMP12]], i64 [[TMP20]]
-; CHECK-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[TMP13]], align 4
+; CHECK-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[TMP12]], align 4
; CHECK-NEXT: store <vscale x 4 x float> [[TMP18]], ptr [[TMP21]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
index e58ea65..0754a38 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
@@ -9,38 +9,6 @@ target triple = "aarch64-unknown-linux-gnu"
; CHECK-NOT: LV: Found {{.*}} scalar instruction: %ptr.iv.2.next = getelementptr inbounds i8, ptr %ptr.iv.2, i64 1
;
; CHECK: VPlan 'Initial VPlan for VF={vscale x 2},UF>=1' {
-; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
-; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
-; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
-; CHECK-NEXT: Live-in ir<%N> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<entry>:
-; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: vp<[[END1:%.+]]> = DERIVED-IV ir<%start.1> + vp<[[VEC_TC]]> * ir<8>
-; CHECK-NEXT: vp<[[END2:%.+]]> = DERIVED-IV ir<%start.2> + vp<[[VEC_TC]]> * ir<1>
-; CHECK-NEXT: Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
-; CHECK-NEXT: EMIT ir<%ptr.iv.2> = WIDEN-POINTER-INDUCTION ir<%start.2>, ir<1>
-; CHECK-NEXT: vp<[[PTR_IDX:%.+]]> = DERIVED-IV ir<0> + vp<[[CAN_IV]]> * ir<8>
-; CHECK-NEXT: vp<[[PTR_IDX_STEPS:%.+]]> = SCALAR-STEPS vp<[[PTR_IDX]]>, ir<8>, vp<[[VF]]>
-; CHECK-NEXT: EMIT vp<[[PTR_IV_1:%.+]]> = ptradd ir<%start.1>, vp<[[PTR_IDX_STEPS]]>
-; CHECK-NEXT: WIDEN-GEP Var[Inv] ir<%ptr.iv.2.next> = getelementptr inbounds ir<%ptr.iv.2>, ir<1>
-; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer vp<[[PTR_IV_1]]>
-; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%ptr.iv.2.next>
-; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-pointer ir<%ptr.iv.2>
-; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VEC_PTR2]]>
-; CHECK-NEXT: WIDEN ir<%add> = add ir<%lv>, ir<1>
-; CHECK-NEXT: vp<[[VEC_PTR3:%.+]]> = vector-pointer ir<%ptr.iv.2>
-; CHECK-NEXT: WIDEN store vp<[[VEC_PTR3]]>, ir<%add>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
-; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
; In the test below the pointer phi %ptr.iv.2 is used as
; 1. As a uniform address for the load, and
; 2. Non-uniform use by the getelementptr which is stored. This requires the
@@ -80,13 +48,11 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START_1]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, <vscale x 2 x ptr> [[VECTOR_GEP]], i64 1
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr ptr, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: store <vscale x 2 x ptr> [[TMP16]], ptr [[TMP17]], align 8
+; CHECK-NEXT: store <vscale x 2 x ptr> [[TMP16]], ptr [[NEXT_GEP]], align 8
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <vscale x 2 x ptr> [[VECTOR_GEP]], i32 0
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP18]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP19]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP18]], align 1
; CHECK-NEXT: [[TMP20:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]], splat (i8 1)
-; CHECK-NEXT: store <vscale x 2 x i8> [[TMP20]], ptr [[TMP19]], align 1
+; CHECK-NEXT: store <vscale x 2 x i8> [[TMP20]], ptr [[TMP18]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP11]]
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -170,10 +136,9 @@ define void @pointer_induction(ptr noalias %start, i64 %N) {
; CHECK-NEXT: [[TMP14:%.*]] = mul <vscale x 2 x i64> [[TMP13]], splat (i64 1)
; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[TMP14]]
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 2 x ptr> [[VECTOR_GEP]], i32 0
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP16]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP15]], align 1
; CHECK-NEXT: [[TMP17:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]], splat (i8 1)
-; CHECK-NEXT: store <vscale x 2 x i8> [[TMP17]], ptr [[TMP16]], align 1
+; CHECK-NEXT: store <vscale x 2 x i8> [[TMP17]], ptr [[TMP15]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX2]], [[TMP6]]
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
index 46b4762..3b04df3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
@@ -309,19 +309,17 @@ define void @test_v4_v4m(ptr noalias %a, ptr readonly %b) #3 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @foo_vector_fixed4_nomask(<4 x i64> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP5]], align 8
+; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -364,19 +362,17 @@ define void @test_v2_v4m(ptr noalias %a, ptr readonly %b) #3 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @foo_vector_fixed4_mask(<4 x i64> [[WIDE_LOAD]], <4 x i1> splat (i1 true))
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP5]], align 8
+; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -419,19 +415,17 @@ define void @test_v2_v4(ptr noalias %a, ptr readonly %b) #3 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @foo_vector_fixed4_nomask(<4 x i64> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP5]], align 8
+; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
index f6443b1e..e55e322 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
@@ -22,8 +22,7 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[LOAD_VAL]], i64 0
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr %dst, i64 [[IDX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[IDX_NEXT]] = add i64 [[IDX]], 4
; CHECK-NEXT: [[NEXT_ACTIVE_LANE_MASK]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[IDX]], i64 [[N2]])
; CHECK-NEXT: [[NOT_ACTIVE_LANE_MASK:%.*]] = xor <4 x i1> [[NEXT_ACTIVE_LANE_MASK]], splat (i1 true)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
index 0326fe5..7308129 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
@@ -30,8 +30,7 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
; NONE: vector.body:
; NONE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; NONE-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]]
-; NONE-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
-; NONE-NEXT: store <vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 4
+; NONE-NEXT: store <vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 4
; NONE-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], [[TMP8]]
; NONE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
; NONE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -71,8 +70,7 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
; DATA-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; DATA-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[UMAX]])
; DATA-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]]
-; DATA-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
-; DATA-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; DATA-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; DATA-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP13]]
; DATA-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
; DATA-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -119,8 +117,7 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
; DATA_NO_LANEMASK-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT3]], [[TMP11]]
; DATA_NO_LANEMASK-NEXT: [[TMP12:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
; DATA_NO_LANEMASK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]]
-; DATA_NO_LANEMASK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
-; DATA_NO_LANEMASK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[TMP12]])
+; DATA_NO_LANEMASK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[TMP12]])
; DATA_NO_LANEMASK-NEXT: [[INDEX_NEXT6]] = add i64 [[INDEX1]], [[TMP16]]
; DATA_NO_LANEMASK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]]
; DATA_NO_LANEMASK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -160,8 +157,7 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
; DATA_AND_CONTROL-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; DATA_AND_CONTROL-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; DATA_AND_CONTROL-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]]
-; DATA_AND_CONTROL-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
-; DATA_AND_CONTROL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; DATA_AND_CONTROL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; DATA_AND_CONTROL-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP13]]
; DATA_AND_CONTROL-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]])
; DATA_AND_CONTROL-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
@@ -208,8 +204,7 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]]
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
index 94f46bf..3215356b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-constant-ops.ll
@@ -28,7 +28,7 @@ define void @test_add_double_same_const_args_1(ptr %res, ptr noalias %A, ptr noa
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br [[EXIT:label %.*]]
; CHECK: [[SCALAR_PH]]:
;
entry:
@@ -78,7 +78,7 @@ define void @test_add_double_same_const_args_2(ptr %res, ptr noalias %A, ptr noa
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br [[EXIT:label %.*]]
; CHECK: [[SCALAR_PH]]:
;
entry:
@@ -138,7 +138,7 @@ define void @test_add_double_mixed_const_args(ptr %res, ptr noalias %A, ptr noal
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br [[EXIT:label %.*]]
; CHECK: [[SCALAR_PH]]:
;
entry:
@@ -200,7 +200,7 @@ define void @test_add_double_same_var_args_1(ptr %res, ptr noalias %A, ptr noali
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br [[EXIT:label %.*]]
; CHECK: [[SCALAR_PH]]:
;
entry:
@@ -262,7 +262,7 @@ define void @test_add_double_same_var_args_2(ptr %res, ptr noalias %A, ptr noali
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br [[EXIT:label %.*]]
; CHECK: [[SCALAR_PH]]:
;
entry:
@@ -324,7 +324,7 @@ define void @test_add_double_same_var_args_at_different_positions(ptr %res, ptr
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br [[EXIT:label %.*]]
; CHECK: [[SCALAR_PH]]:
;
entry:
@@ -388,7 +388,7 @@ define void @test_add_double_different_var_args_1(ptr %res, ptr noalias %A, ptr
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br [[EXIT:label %.*]]
; CHECK: [[SCALAR_PH]]:
;
entry:
@@ -452,7 +452,7 @@ define void @test_add_double_different_var_args_2(ptr %res, ptr noalias %A, ptr
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br [[EXIT:label %.*]]
; CHECK: [[SCALAR_PH]]:
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
index ccfa725..ac39ecff 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
@@ -180,11 +180,10 @@ define void @test_interleave_store_one_constant(ptr noalias %src, ptr noalias %d
; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 6
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP13]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[TMP13]], i32 2
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr double, ptr [[TMP13]], i32 4
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[TMP13]], i32 6
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP14]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP13]], align 8
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x double>, ptr [[TMP15]], align 8
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <2 x double>, ptr [[TMP16]], align 8
; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x double>, ptr [[TMP17]], align 8
@@ -226,8 +225,7 @@ define void @test_interleave_store_one_constant(ptr noalias %src, ptr noalias %d
; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX14:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT17:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP31:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX14]]
-; CHECK-NEXT: [[TMP32:%.*]] = getelementptr double, ptr [[TMP31]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <2 x double>, ptr [[TMP32]], align 8
+; CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <2 x double>, ptr [[TMP31]], align 8
; CHECK-NEXT: [[TMP33:%.*]] = fmul <2 x double> [[WIDE_LOAD15]], splat (double 5.000000e+00)
; CHECK-NEXT: [[TMP34:%.*]] = getelementptr [2 x double], ptr [[DST]], i64 [[INDEX14]]
; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <2 x double> [[TMP33]], <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -332,11 +330,10 @@ define void @single_fmul_used_by_each_member(ptr noalias %A, ptr noalias %B, ptr
; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 6
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[TMP23]], i32 0
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP23]], i32 2
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP23]], i32 4
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP23]], i32 6
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP24]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP23]], align 8
; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <2 x double>, ptr [[TMP25]], align 8
; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <2 x double>, ptr [[TMP26]], align 8
; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <2 x double>, ptr [[TMP27]], align 8
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-derived-ivs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-derived-ivs.ll
index 3cde3f3..8f241c9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-derived-ivs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-derived-ivs.ll
@@ -325,7 +325,7 @@ define void @narrow_with_uniform_add_and_gep(ptr noalias %p) {
; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; VF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2-NEXT: br [[EXIT:label %.*]]
; VF2: [[SCALAR_PH]]:
;
; VF2IC2-LABEL: define void @narrow_with_uniform_add_and_gep(
@@ -352,7 +352,7 @@ define void @narrow_with_uniform_add_and_gep(ptr noalias %p) {
; VF2IC2-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; VF2IC2-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VF2IC2: [[MIDDLE_BLOCK]]:
-; VF2IC2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2IC2-NEXT: br [[EXIT:label %.*]]
; VF2IC2: [[SCALAR_PH]]:
;
; VF4-LABEL: define void @narrow_with_uniform_add_and_gep(
@@ -378,7 +378,7 @@ define void @narrow_with_uniform_add_and_gep(ptr noalias %p) {
; VF4-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; VF4-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4-NEXT: br [[EXIT:label %.*]]
; VF4: [[SCALAR_PH]]:
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-metadata.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-metadata.ll
index 0a83ff6..0244414 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-metadata.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-metadata.ll
@@ -23,7 +23,7 @@ define void @load_store_interleave_group_with_metadata(ptr noalias %data) {
; VF2-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2-NEXT: br [[EXIT:label %.*]]
; VF2: [[SCALAR_PH]]:
;
entry:
@@ -54,11 +54,6 @@ exit:
!4 = !{ i64 0, i64 2 }
!5 = !{ i64 0, i64 2 }
-;.
-; VF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; VF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; VF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-;.
; VF4: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
; VF4: [[META1]] = !{!"omnipotent char", [[META2:![0-9]+]], i64 0}
; VF4: [[META2]] = !{!"Simple C/C++ TBAA"}
@@ -66,3 +61,7 @@ exit:
; VF4: [[META4]] = !{!"llvm.loop.isvectorized", i32 1}
; VF4: [[META5]] = !{!"llvm.loop.unroll.runtime.disable"}
;.
+; VF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; VF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
index fea57fa..d0ea828 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
@@ -21,9 +21,9 @@ define void @load_store_interleave_group_tc_2(ptr noalias %data) {
; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[DATA]], align 8
; VF2-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2-NEXT: br label %[[EXIT:.*]]
; VF2: [[SCALAR_PH]]:
-; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; VF2-NEXT: br label %[[LOOP:.*]]
; VF2: [[LOOP]]:
; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -251,9 +251,9 @@ define void @test_complex_add_float_tc_4(ptr %res, ptr noalias %A, ptr noalias %
; VF2-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
; VF2-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF2-NEXT: br label %[[EXIT:.*]]
; VF2: [[SCALAR_PH]]:
-; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; VF2-NEXT: br label %[[LOOP:.*]]
; VF2: [[LOOP]]:
; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -297,9 +297,9 @@ define void @test_complex_add_float_tc_4(ptr %res, ptr noalias %A, ptr noalias %
; VF4-NEXT: store <8 x float> [[INTERLEAVED_VEC]], ptr [[RES]], align 4
; VF4-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4-NEXT: br label %[[EXIT:.*]]
; VF4: [[SCALAR_PH]]:
-; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; VF4-NEXT: br label %[[LOOP:.*]]
; VF4: [[LOOP]]:
; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll
index 3a7b448..4df02a7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll
@@ -26,9 +26,9 @@ define void @load_store_interleave_group(ptr noalias %data) {
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -78,9 +78,8 @@ define void @test_2xi64_with_wide_load(ptr noalias %data, ptr noalias %factor) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 2
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8
; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
; CHECK-NEXT: [[TMP6:%.*]] = shl nsw i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP7:%.*]] = shl nsw i64 [[TMP0]], 1
@@ -106,9 +105,9 @@ define void @test_2xi64_with_wide_load(ptr noalias %data, ptr noalias %factor) {
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
index 38e224f..e04b550 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll
@@ -23,7 +23,7 @@ define void @test_2xi64_unary_op_load_interleave_group(ptr noalias %data, ptr no
; VF2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2-NEXT: br [[EXIT:label %.*]]
; VF2: [[SCALAR_PH]]:
;
; VF4-LABEL: define void @test_2xi64_unary_op_load_interleave_group(
@@ -48,7 +48,7 @@ define void @test_2xi64_unary_op_load_interleave_group(ptr noalias %data, ptr no
; VF4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4-NEXT: br [[EXIT:label %.*]]
; VF4: [[SCALAR_PH]]:
;
entry:
@@ -85,13 +85,11 @@ define void @test_2xi64_unary_op_wide_load(ptr noalias %data, ptr noalias %A, pt
; VF2-NEXT: [[TMP14:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[TMP14]], 1
; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP14]]
-; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[TMP6]], i32 0
-; VF2-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; VF2-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x double>, ptr [[TMP6]], align 8
; VF2-NEXT: [[TMP15:%.*]] = fneg <2 x double> [[WIDE_LOAD3]]
; VF2-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP0]]
; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP14]]
-; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i32 0
-; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP8]], align 8
+; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
; VF2-NEXT: [[TMP9:%.*]] = fneg <2 x double> [[WIDE_LOAD1]]
; VF2-NEXT: [[TMP17:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; VF2-NEXT: [[INTERLEAVED_VEC4:%.*]] = shufflevector <4 x double> [[TMP17]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
@@ -100,7 +98,7 @@ define void @test_2xi64_unary_op_wide_load(ptr noalias %data, ptr noalias %A, pt
; VF2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2-NEXT: br [[EXIT:label %.*]]
; VF2: [[SCALAR_PH]]:
;
; VF4-LABEL: define void @test_2xi64_unary_op_wide_load(
@@ -113,13 +111,11 @@ define void @test_2xi64_unary_op_wide_load(ptr noalias %data, ptr noalias %A, pt
; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF4-NEXT: [[TMP1:%.*]] = shl nsw i64 [[INDEX]], 1
; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]]
-; VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i32 0
-; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP3]], align 8
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP2]], align 8
; VF4-NEXT: [[TMP4:%.*]] = fneg <4 x double> [[WIDE_LOAD]]
; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP1]]
; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDEX]]
-; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[TMP6]], i32 0
-; VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x double>, ptr [[TMP7]], align 8
+; VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x double>, ptr [[TMP6]], align 8
; VF4-NEXT: [[TMP8:%.*]] = fneg <4 x double> [[WIDE_LOAD1]]
; VF4-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
@@ -128,7 +124,7 @@ define void @test_2xi64_unary_op_wide_load(ptr noalias %data, ptr noalias %A, pt
; VF4-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4-NEXT: br [[EXIT:label %.*]]
; VF4: [[SCALAR_PH]]:
;
entry:
@@ -178,7 +174,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor) {
; VF2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2-NEXT: br [[EXIT:label %.*]]
; VF2: [[SCALAR_PH]]:
;
; VF4-LABEL: define void @test_2xi64(
@@ -190,8 +186,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor) {
; VF4: [[VECTOR_BODY]]:
; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; VF4-NEXT: [[TMP10:%.*]] = shl nsw i64 [[INDEX]], 1
; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP10]]
; VF4-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP11]], align 8
@@ -206,7 +201,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor) {
; VF4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4-NEXT: br [[EXIT:label %.*]]
; VF4: [[SCALAR_PH]]:
;
entry:
@@ -244,8 +239,7 @@ define void @test_2xi64_different_opcodes(ptr noalias %data, ptr noalias %factor
; VF2: [[VECTOR_BODY]]:
; VF2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
-; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8
; VF2-NEXT: [[TMP3:%.*]] = shl nsw i64 [[TMP0]], 1
; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
@@ -260,7 +254,7 @@ define void @test_2xi64_different_opcodes(ptr noalias %data, ptr noalias %factor
; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2-NEXT: br [[EXIT:label %.*]]
; VF2: [[SCALAR_PH]]:
;
; VF4-LABEL: define void @test_2xi64_different_opcodes(
@@ -272,8 +266,7 @@ define void @test_2xi64_different_opcodes(ptr noalias %data, ptr noalias %factor
; VF4: [[VECTOR_BODY]]:
; VF4-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
-; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; VF4-NEXT: [[TMP3:%.*]] = shl nsw i64 [[TMP0]], 1
; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
; VF4-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
@@ -288,7 +281,7 @@ define void @test_2xi64_different_opcodes(ptr noalias %data, ptr noalias %factor
; VF4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4-NEXT: br [[EXIT:label %.*]]
; VF4: [[SCALAR_PH]]:
;
entry:
@@ -326,8 +319,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
; VF2: [[VECTOR_BODY]]:
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8
; VF2-NEXT: [[TMP15:%.*]] = shl nsw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP15]]
; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP16]], align 8
@@ -342,7 +334,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2-NEXT: br [[EXIT:label %.*]]
; VF2: [[SCALAR_PH]]:
;
; VF4-LABEL: define void @test_2xi64_interleave_loads_order_flipped(
@@ -354,8 +346,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
; VF4: [[VECTOR_BODY]]:
; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; VF4-NEXT: [[TMP27:%.*]] = shl nsw i64 [[INDEX]], 1
; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP27]]
; VF4-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP28]], align 8
@@ -370,7 +361,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
; VF4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4-NEXT: br [[EXIT:label %.*]]
; VF4: [[SCALAR_PH]]:
;
entry:
@@ -408,8 +399,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
; VF2: [[VECTOR_BODY]]:
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8
; VF2-NEXT: [[TMP6:%.*]] = shl nsw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
@@ -424,7 +414,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
; VF2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2-NEXT: br [[EXIT:label %.*]]
; VF2: [[SCALAR_PH]]:
;
; VF4-LABEL: define void @test_2xi64_store_order_flipped_1(
@@ -436,8 +426,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
; VF4: [[VECTOR_BODY]]:
; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; VF4-NEXT: [[TMP10:%.*]] = shl nsw i64 [[INDEX]], 1
; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP10]]
; VF4-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP11]], align 8
@@ -452,7 +441,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
; VF4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4-NEXT: br [[EXIT:label %.*]]
; VF4: [[SCALAR_PH]]:
;
entry:
@@ -490,8 +479,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa
; VF2: [[VECTOR_BODY]]:
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8
; VF2-NEXT: [[TMP6:%.*]] = shl nsw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
@@ -506,7 +494,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa
; VF2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2-NEXT: br [[EXIT:label %.*]]
; VF2: [[SCALAR_PH]]:
;
; VF4-LABEL: define void @test_2xi64_store_order_flipped_2(
@@ -518,8 +506,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa
; VF4: [[VECTOR_BODY]]:
; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; VF4-NEXT: [[TMP10:%.*]] = shl nsw i64 [[INDEX]], 1
; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP10]]
; VF4-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP11]], align 8
@@ -534,7 +521,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa
; VF4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4-NEXT: br [[EXIT:label %.*]]
; VF4: [[SCALAR_PH]]:
;
entry:
@@ -572,26 +559,24 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal
; VF2: [[VECTOR_BODY]]:
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[INDEX]]
-; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8
; VF2-NEXT: [[TMP6:%.*]] = shl nsw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
-; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8
; VF2-NEXT: [[TMP8:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; VF2-NEXT: [[TMP14:%.*]] = or disjoint i64 [[TMP6]], 1
; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP14]]
; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP15]], align 8
; VF2-NEXT: [[TMP19:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
; VF2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[INDEX]]
-; VF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP20]], i32 0
-; VF2-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i64>, ptr [[TMP21]], align 8
+; VF2-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i64>, ptr [[TMP20]], align 8
; VF2-NEXT: [[TMP22:%.*]] = mul <2 x i64> [[WIDE_LOAD2]], [[TMP19]]
; VF2-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP22]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP13]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 8
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VF2-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 98
-; VF2-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; VF2-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 98
+; VF2-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
; VF2-NEXT: br label %[[SCALAR_PH]]
; VF2: [[SCALAR_PH]]:
@@ -605,19 +590,17 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal
; VF4: [[VECTOR_BODY]]:
; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[INDEX]]
-; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; VF4-NEXT: [[TMP10:%.*]] = shl nsw i64 [[INDEX]], 1
; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP10]]
-; VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; VF4-NEXT: [[TMP12:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; VF4-NEXT: [[TMP24:%.*]] = or disjoint i64 [[TMP10]], 1
; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP24]]
; VF4-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP25]], align 8
; VF4-NEXT: [[TMP33:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; VF4-NEXT: [[TMP34:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[INDEX]]
-; VF4-NEXT: [[TMP35:%.*]] = getelementptr inbounds i64, ptr [[TMP34]], i32 0
-; VF4-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP35]], align 8
+; VF4-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP34]], align 8
; VF4-NEXT: [[TMP36:%.*]] = mul <4 x i64> [[WIDE_LOAD2]], [[TMP33]]
; VF4-NEXT: [[TMP13:%.*]] = shufflevector <4 x i64> [[TMP12]], <4 x i64> [[TMP36]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP13]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
@@ -666,8 +649,7 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor) {
; VF2: [[VECTOR_BODY]]:
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[INDEX]], i32 0
; VF2-NEXT: [[WIDE_VEC:%.*]] = load <6 x i64>, ptr [[TMP4]], align 8
; VF2-NEXT: [[TMP9:%.*]] = shufflevector <6 x i64> [[WIDE_VEC]], <6 x i64> poison, <2 x i32> <i32 0, i32 3>
@@ -685,7 +667,7 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor) {
; VF2-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2-NEXT: br [[EXIT:label %.*]]
; VF2: [[SCALAR_PH]]:
;
; VF4-LABEL: define void @test_3xi64(
@@ -697,8 +679,7 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor) {
; VF4: [[VECTOR_BODY]]:
; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]]
-; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[INDEX]], i32 0
; VF4-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP6]], align 8
; VF4-NEXT: [[TMP17:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
@@ -716,7 +697,7 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor) {
; VF4-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4-NEXT: br [[EXIT:label %.*]]
; VF4: [[SCALAR_PH]]:
;
entry:
@@ -973,7 +954,7 @@ define void @test_2xi64_sub_of_wide_loads(ptr noalias %data, ptr noalias %A, ptr
; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
; VF2-NEXT: br i1 [[EC]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2-NEXT: br [[EXIT:label %.*]]
; VF2: [[SCALAR_PH]]:
;
; VF4-LABEL: define void @test_2xi64_sub_of_wide_loads(
@@ -985,11 +966,9 @@ define void @test_2xi64_sub_of_wide_loads(ptr noalias %data, ptr noalias %A, ptr
; VF4: [[VECTOR_BODY]]:
; VF4-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
-; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
; VF4-NEXT: [[TMP5:%.*]] = sub <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; VF4-NEXT: [[TMP6:%.*]] = shl nsw i64 [[TMP0]], 1
; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
@@ -1001,7 +980,7 @@ define void @test_2xi64_sub_of_wide_loads(ptr noalias %data, ptr noalias %A, ptr
; VF4-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4-NEXT: br [[EXIT:label %.*]]
; VF4: [[SCALAR_PH]]:
;
entry:
@@ -1039,11 +1018,9 @@ define void @test_2xi64_sub_of_wide_loads_ops_swapped(ptr noalias %data, ptr noa
; VF2: [[VECTOR_BODY]]:
; VF2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
-; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; VF2-NEXT: [[BROADCAST_SPLAT6:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8
+; VF2-NEXT: [[BROADCAST_SPLAT6:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8
; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
+; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
; VF2-NEXT: [[TMP13:%.*]] = sub <2 x i64> [[BROADCAST_SPLAT6]], [[WIDE_LOAD1]]
; VF2-NEXT: [[TMP20:%.*]] = shl nsw i64 [[TMP0]], 1
; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP20]]
@@ -1055,7 +1032,7 @@ define void @test_2xi64_sub_of_wide_loads_ops_swapped(ptr noalias %data, ptr noa
; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
; VF2-NEXT: br i1 [[EC]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2-NEXT: br [[EXIT:label %.*]]
; VF2: [[SCALAR_PH]]:
;
; VF4-LABEL: define void @test_2xi64_sub_of_wide_loads_ops_swapped(
@@ -1067,11 +1044,9 @@ define void @test_2xi64_sub_of_wide_loads_ops_swapped(ptr noalias %data, ptr noa
; VF4: [[VECTOR_BODY]]:
; VF4-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
-; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
; VF4-NEXT: [[TMP5:%.*]] = sub <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; VF4-NEXT: [[TMP6:%.*]] = shl nsw i64 [[TMP0]], 1
; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
@@ -1083,7 +1058,7 @@ define void @test_2xi64_sub_of_wide_loads_ops_swapped(ptr noalias %data, ptr noa
; VF4-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4-NEXT: br [[EXIT:label %.*]]
; VF4: [[SCALAR_PH]]:
;
entry:
@@ -1121,17 +1096,14 @@ define void @test_2xi64_sub_of_wide_loads_with_different_base_ptrs(ptr noalias %
; VF2: [[VECTOR_BODY]]:
; VF2-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
-; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8
; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
+; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
; VF2-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; VF2-NEXT: [[TMP6:%.*]] = shl nsw i64 [[TMP0]], 1
; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[C]], i64 [[TMP0]]
-; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; VF2-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8
+; VF2-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8
; VF2-NEXT: [[TMP10:%.*]] = sub <2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; VF2-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
@@ -1140,7 +1112,7 @@ define void @test_2xi64_sub_of_wide_loads_with_different_base_ptrs(ptr noalias %
; VF2-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2-NEXT: br [[EXIT:label %.*]]
; VF2: [[SCALAR_PH]]:
;
; VF4-LABEL: define void @test_2xi64_sub_of_wide_loads_with_different_base_ptrs(
@@ -1152,17 +1124,14 @@ define void @test_2xi64_sub_of_wide_loads_with_different_base_ptrs(ptr noalias %
; VF4: [[VECTOR_BODY]]:
; VF4-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
-; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
; VF4-NEXT: [[TMP5:%.*]] = sub <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; VF4-NEXT: [[TMP6:%.*]] = shl nsw i64 [[TMP0]], 1
; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[C]], i64 [[TMP0]]
-; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; VF4-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
+; VF4-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8
; VF4-NEXT: [[TMP10:%.*]] = sub <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; VF4-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
@@ -1171,7 +1140,7 @@ define void @test_2xi64_sub_of_wide_loads_with_different_base_ptrs(ptr noalias %
; VF4-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4-NEXT: br [[EXIT:label %.*]]
; VF4: [[SCALAR_PH]]:
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
index 6acd798..41b89dc 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
@@ -22,7 +22,7 @@ define void @load_store_interleave_group(ptr noalias %data) {
; VF2-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2-NEXT: br [[EXIT:label %.*]]
; VF2: [[SCALAR_PH]]:
;
; VF4-LABEL: define void @load_store_interleave_group(
@@ -45,7 +45,7 @@ define void @load_store_interleave_group(ptr noalias %data) {
; VF4-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4-NEXT: br [[EXIT:label %.*]]
; VF4: [[SCALAR_PH]]:
;
entry:
@@ -87,7 +87,7 @@ define void @load_store_interleave_group_different_objecs(ptr noalias %src, ptr
; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2-NEXT: br [[EXIT:label %.*]]
; VF2: [[SCALAR_PH]]:
;
; VF4-LABEL: define void @load_store_interleave_group_different_objecs(
@@ -111,7 +111,7 @@ define void @load_store_interleave_group_different_objecs(ptr noalias %src, ptr
; VF4-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4-NEXT: br [[EXIT:label %.*]]
; VF4: [[SCALAR_PH]]:
;
entry:
@@ -148,8 +148,7 @@ define void @single_wide_load_store_interleave_group(ptr noalias %src, ptr noali
; VF2-NEXT: [[TMP7:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP9:%.*]] = shl nsw i64 [[TMP7]], 1
; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP7]]
-; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
+; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP9]]
; VF2-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[WIDE_LOAD1]], <2 x i64> [[WIDE_LOAD1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; VF2-NEXT: [[INTERLEAVED_VEC2:%.*]] = shufflevector <4 x i64> [[TMP10]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
@@ -158,7 +157,7 @@ define void @single_wide_load_store_interleave_group(ptr noalias %src, ptr noali
; VF2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2-NEXT: br [[EXIT:label %.*]]
; VF2: [[SCALAR_PH]]:
;
; VF4-LABEL: define void @single_wide_load_store_interleave_group(
@@ -171,8 +170,7 @@ define void @single_wide_load_store_interleave_group(ptr noalias %src, ptr noali
; VF4-NEXT: [[TMP7:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF4-NEXT: [[TMP9:%.*]] = shl nsw i64 [[TMP7]], 1
; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP7]]
-; VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP9]]
; VF4-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD1]], <4 x i64> [[WIDE_LOAD1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; VF4-NEXT: [[INTERLEAVED_VEC2:%.*]] = shufflevector <8 x i64> [[TMP10]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
@@ -181,7 +179,7 @@ define void @single_wide_load_store_interleave_group(ptr noalias %src, ptr noali
; VF4-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4-NEXT: br [[EXIT:label %.*]]
; VF4: [[SCALAR_PH]]:
;
entry:
@@ -221,7 +219,7 @@ define void @same_constant_store_interleave_group(i64 %x, ptr noalias %dst) {
; VF2-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2-NEXT: br [[EXIT:label %.*]]
; VF2: [[SCALAR_PH]]:
;
; VF4-LABEL: define void @same_constant_store_interleave_group(
@@ -239,7 +237,7 @@ define void @same_constant_store_interleave_group(i64 %x, ptr noalias %dst) {
; VF4-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4-NEXT: br [[EXIT:label %.*]]
; VF4: [[SCALAR_PH]]:
;
entry:
@@ -277,7 +275,7 @@ define void @different_constants_store_interleave_group(i64 %x, i64 %y, ptr noal
; VF2-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2-NEXT: br [[EXIT:label %.*]]
; VF2: [[SCALAR_PH]]:
;
; VF4-LABEL: define void @different_constants_store_interleave_group(
@@ -295,7 +293,7 @@ define void @different_constants_store_interleave_group(i64 %x, i64 %y, ptr noal
; VF4-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4-NEXT: br [[EXIT:label %.*]]
; VF4: [[SCALAR_PH]]:
;
entry:
@@ -337,7 +335,7 @@ define void @same_live_in_store_interleave_group(i64 %x, ptr noalias %dst) {
; VF2-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2-NEXT: br [[EXIT:label %.*]]
; VF2: [[SCALAR_PH]]:
;
; VF4-LABEL: define void @same_live_in_store_interleave_group(
@@ -359,7 +357,7 @@ define void @same_live_in_store_interleave_group(i64 %x, ptr noalias %dst) {
; VF4-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4-NEXT: br [[EXIT:label %.*]]
; VF4: [[SCALAR_PH]]:
;
entry:
@@ -403,7 +401,7 @@ define void @different_live_ins_store_interleave_group(i64 %x, i64 %y, ptr noali
; VF2-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2-NEXT: br [[EXIT:label %.*]]
; VF2: [[SCALAR_PH]]:
;
; VF4-LABEL: define void @different_live_ins_store_interleave_group(
@@ -427,7 +425,7 @@ define void @different_live_ins_store_interleave_group(i64 %x, i64 %y, ptr noali
; VF4-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4-NEXT: br [[EXIT:label %.*]]
; VF4: [[SCALAR_PH]]:
;
entry:
@@ -468,7 +466,7 @@ define void @single_uniform_load_store_interleave_group(ptr noalias %src, ptr no
; VF2-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2-NEXT: br [[EXIT:label %.*]]
; VF2: [[SCALAR_PH]]:
;
; VF4-LABEL: define void @single_uniform_load_store_interleave_group(
@@ -491,7 +489,7 @@ define void @single_uniform_load_store_interleave_group(ptr noalias %src, ptr no
; VF4-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4-NEXT: br [[EXIT:label %.*]]
; VF4: [[SCALAR_PH]]:
;
entry:
@@ -538,7 +536,7 @@ define void @multiple_uniform_load_store_interleave_group(ptr noalias %src.0, pt
; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
-; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF2-NEXT: br [[EXIT:label %.*]]
; VF2: [[SCALAR_PH]]:
;
; VF4-LABEL: define void @multiple_uniform_load_store_interleave_group(
@@ -564,7 +562,7 @@ define void @multiple_uniform_load_store_interleave_group(ptr noalias %src.0, pt
; VF4-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF4-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; VF4-NEXT: br [[EXIT:label %.*]]
; VF4: [[SCALAR_PH]]:
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll
index c017c1f..a2cbf6f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll
@@ -18,40 +18,39 @@ define void @test0(ptr noalias %M3, ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP5]], align 2
-; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i16> [[WIDE_LOAD]], splat (i16 10)
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP4]], align 2
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i16> [[WIDE_LOAD]], splat (i16 10)
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8
; CHECK-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8
; CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
-; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
+; CHECK-NEXT: [[TMP14:%.*]] = ashr exact i64 [[TMP10]], 32
; CHECK-NEXT: [[TMP15:%.*]] = ashr exact i64 [[TMP11]], 32
; CHECK-NEXT: [[TMP16:%.*]] = ashr exact i64 [[TMP12]], 32
; CHECK-NEXT: [[TMP17:%.*]] = ashr exact i64 [[TMP13]], 32
-; CHECK-NEXT: [[TMP18:%.*]] = ashr exact i64 [[TMP14]], 32
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP14]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP15]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP16]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP18]]
-; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0
+; CHECK-NEXT: store i16 [[TMP22]], ptr [[TMP18]], align 2
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1
; CHECK-NEXT: store i16 [[TMP23]], ptr [[TMP19]], align 2
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[TMP6]], i32 1
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
; CHECK-NEXT: store i16 [[TMP24]], ptr [[TMP20]], align 2
-; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[TMP6]], i32 2
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
; CHECK-NEXT: store i16 [[TMP25]], ptr [[TMP21]], align 2
-; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[TMP6]], i32 3
-; CHECK-NEXT: store i16 [[TMP26]], ptr [[TMP22]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_INC1286_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_INC1286_LOOPEXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[IF_THEN1165_US:%.*]]
; CHECK: if.then1165.us:
; CHECK-NEXT: [[INDVARS_IV1783:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT1784:%.*]], [[IF_THEN1165_US]] ]
@@ -111,41 +110,40 @@ define void @test1(ptr noalias %M3, ptr noalias %A, ptr noalias %B, ptr noalias
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 2
-; CHECK-NEXT: [[TMP7:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT]] to <4 x i16>
-; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i16> [[WIDE_LOAD]], [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP5]], align 2
+; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT]] to <4 x i16>
+; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i16> [[WIDE_LOAD]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8
; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8
; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8
; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8
-; CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8
+; CHECK-NEXT: [[TMP16:%.*]] = ashr exact i64 [[TMP12]], 32
; CHECK-NEXT: [[TMP17:%.*]] = ashr exact i64 [[TMP13]], 32
; CHECK-NEXT: [[TMP18:%.*]] = ashr exact i64 [[TMP14]], 32
; CHECK-NEXT: [[TMP19:%.*]] = ashr exact i64 [[TMP15]], 32
-; CHECK-NEXT: [[TMP20:%.*]] = ashr exact i64 [[TMP16]], 32
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP16]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP17]]
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP18]]
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP19]]
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP20]]
-; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[TMP7]], i32 0
+; CHECK-NEXT: store i16 [[TMP24]], ptr [[TMP20]], align 2
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[TMP7]], i32 1
; CHECK-NEXT: store i16 [[TMP25]], ptr [[TMP21]], align 2
-; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[TMP8]], i32 1
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[TMP7]], i32 2
; CHECK-NEXT: store i16 [[TMP26]], ptr [[TMP22]], align 2
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[TMP8]], i32 2
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[TMP7]], i32 3
; CHECK-NEXT: store i16 [[TMP27]], ptr [[TMP23]], align 2
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[TMP8]], i32 3
-; CHECK-NEXT: store i16 [[TMP28]], ptr [[TMP24]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_INC1286_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_INC1286_LOOPEXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[IF_THEN1165_US:%.*]]
; CHECK: if.then1165.us:
; CHECK-NEXT: [[INDVARS_IV1783:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT1784:%.*]], [[IF_THEN1165_US]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll
index 2705d69..a431fdd 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll
@@ -17,11 +17,10 @@ define void @test_remove_vector_loop_region_epilogue(ptr %dst, i1 %c) {
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TC]], [[N_MOD_VF]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[DST]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i32 16
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i32 32
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i32 48
-; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP1]], align 4
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[DST]], align 4
; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP2]], align 4
; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP3]], align 4
; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 4
@@ -40,8 +39,7 @@ define void @test_remove_vector_loop_region_epilogue(ptr %dst, i1 %c) {
; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 [[VEC_EPILOG_RESUME_VAL]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr [[TMP6]], align 4
+; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr [[TMP5]], align 4
; CHECK-NEXT: br label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]]
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[TC]], [[N_VEC3]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index 399d676..bba9293 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -90,12 +90,10 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-NEXT: EMIT-SCALAR vp<[[EP_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi vp<[[RDX_START]]>, ir<%add> (VF scaled by 1/4)
; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[EP_IV]]>
-; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a>
-; CHECK-NEXT: WIDEN ir<%load.a> = load vp<[[PTR_A]]>
+; CHECK-NEXT: WIDEN ir<%load.a> = load ir<%gep.a>
; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[EP_IV]]>
-; CHECK-NEXT: vp<[[PTR_B:%.+]]> = vector-pointer ir<%gep.b>
-; CHECK-NEXT: WIDEN ir<%load.b> = load vp<[[PTR_B]]>
+; CHECK-NEXT: WIDEN ir<%load.b> = load ir<%gep.b>
; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
; CHECK-NEXT: PARTIAL-REDUCE ir<%add> = add ir<%accum>, ir<%mul>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
index 1f2e918..d014468 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
@@ -109,8 +109,7 @@ define void @test(ptr noalias %src, ptr noalias %dst) {
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = fpext <2 x float> [[WIDE_LOAD]] to <2 x double>
; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x double> @__simd_sin_v2f64(<2 x double> [[TMP4]])
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 [[TMP0]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
index 3545c6b..c1d4317 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
@@ -68,7 +68,7 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #1 {
; NARROW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; NARROW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; NARROW: middle.block:
-; NARROW-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NARROW-NEXT: br label [[SCALAR_PH]]
; NARROW: scalar.ph:
; NARROW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; NARROW-NEXT: br label [[FOR_BODY:%.*]]
@@ -82,7 +82,7 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #1 {
; NARROW-NEXT: store float [[CALL]], ptr [[ARRAYIDX]], align 4
; NARROW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; NARROW-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
-; NARROW-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NARROW-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; NARROW: for.cond.cleanup:
; NARROW-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll b/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll
index ab7bb66..8aab77e 100644
--- a/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll
+++ b/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll
@@ -25,7 +25,7 @@ define half @vectorize_v2f16_loop(ptr addrspace(1) noalias %s) {
; GFX9: middle.block:
; GFX9-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x half> [[TMP3]], [[TMP2]]
; GFX9-NEXT: [[TMP5:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> [[BIN_RDX]])
-; GFX9-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; GFX9-NEXT: br label [[FOR_END:%.*]]
; GFX9: scalar.ph:
; GFX9-NEXT: br label [[FOR_BODY:%.*]]
; GFX9: for.body:
@@ -55,7 +55,7 @@ define half @vectorize_v2f16_loop(ptr addrspace(1) noalias %s) {
; VI: middle.block:
; VI-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x half> [[TMP3]], [[TMP2]]
; VI-NEXT: [[TMP5:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> [[BIN_RDX]])
-; VI-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; VI-NEXT: br label [[FOR_END:%.*]]
; VI: scalar.ph:
; VI-NEXT: br label [[FOR_BODY:%.*]]
; VI: for.body:
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
index 9522c7e..66bb80b 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
@@ -17,12 +17,10 @@ define void @test_stride1_4i32(ptr readonly %data, ptr noalias nocapture %dst, i
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i32 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> splat (i32 5), [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -84,8 +82,7 @@ define void @test_stride-1_4i32(ptr readonly %data, ptr noalias nocapture %dst,
; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> splat (i32 5), [[REVERSE]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP8]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP7]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -149,8 +146,7 @@ define void @test_stride2_4i32(ptr readonly %data, ptr noalias nocapture %dst, i
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> splat (i32 5), [[STRIDED_VEC]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP9]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -211,8 +207,7 @@ define void @test_stride3_4i32(ptr readonly %data, ptr noalias nocapture %dst, i
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> splat (i32 5), [[WIDE_MASKED_GATHER]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -274,8 +269,7 @@ define void @test_stride4_4i32(ptr readonly %data, ptr noalias nocapture %dst, i
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> splat (i32 5), [[WIDE_MASKED_GATHER]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -335,12 +329,10 @@ define void @test_stride_loopinvar_4i32(ptr readonly %data, ptr noalias nocaptur
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i32 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> splat (i32 5), [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -403,8 +395,7 @@ define void @test_stride_noninvar_4i32(ptr readonly %data, ptr noalias nocapture
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP4]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison)
; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> splat (i32 5), [[WIDE_MASKED_GATHER]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], splat (i32 32)
@@ -523,8 +514,7 @@ define void @test_stride_noninvar3_4i32(ptr readonly %data, ptr noalias nocaptur
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP6]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison)
; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> splat (i32 5), [[WIDE_MASKED_GATHER]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP9]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-NEXT: [[VEC_IND_NEXT5]] = add <4 x i32> [[VEC_IND4]], [[DOTSPLAT3]]
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-hoist-runtime-checks.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-hoist-runtime-checks.ll
index 68f7e53..029bffd 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-hoist-runtime-checks.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-hoist-runtime-checks.ll
@@ -70,14 +70,12 @@ define void @diff_checks(ptr nocapture noundef writeonly %dst, ptr nocapture nou
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP7]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP12]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison), !alias.scope [[META0:![0-9]+]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP8]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP14]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP11]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison), !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP8]], i32 [[INDEX]]
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP12]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[INNER_LOOP_EXIT]]
; CHECK: scalar.ph:
@@ -86,9 +84,9 @@ define void @diff_checks(ptr nocapture noundef writeonly %dst, ptr nocapture nou
; CHECK: inner.loop:
; CHECK-NEXT: [[J_021_US:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC_US:%.*]], [[INNER_LOOP]] ]
; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr i32, ptr [[TMP7]], i32 [[J_021_US]]
-; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
; CHECK-NEXT: [[ARRAYIDX8_US:%.*]] = getelementptr i32, ptr [[TMP8]], i32 [[J_021_US]]
-; CHECK-NEXT: store i32 [[TMP16]], ptr [[ARRAYIDX8_US]], align 4
+; CHECK-NEXT: store i32 [[TMP14]], ptr [[ARRAYIDX8_US]], align 4
; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[J_021_US]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_US]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[INNER_LOOP_EXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-multiexit.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-multiexit.ll
index 951e3c1..4fc2778 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-multiexit.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-multiexit.ll
@@ -30,11 +30,9 @@ define void @multiple_exits_unique_exit_block(ptr %A, ptr %B, i32 %N) #0 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT: store <4 x i32> [[WIDE_LOAD]], ptr [[TMP8]], align 4
+; CHECK-NEXT: store <4 x i32> [[WIDE_LOAD]], ptr [[TMP7]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -103,11 +101,9 @@ define i32 @multiple_exits_multiple_exit_blocks(ptr %A, ptr %B, i32 %N) #0 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT: store <4 x i32> [[WIDE_LOAD]], ptr [[TMP8]], align 4
+; CHECK-NEXT: store <4 x i32> [[WIDE_LOAD]], ptr [[TMP7]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
index 6d42dee..bc02595 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
@@ -436,7 +436,7 @@ define i32 @reduction_min(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP1]])
-; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
@@ -448,10 +448,9 @@ define i32 @reduction_min(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: [[V0]] = call i32 @llvm.smin.i32(i32 [[RESULT_08]], i32 [[L0]])
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK: for.end:
-; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[V0]], [[FOR_BODY]] ], [ poison, [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]]
+; CHECK-NEXT: ret i32 [[V0]]
;
entry:
br label %for.body
@@ -489,7 +488,7 @@ define i32 @reduction_max(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP1]])
-; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
@@ -501,10 +500,9 @@ define i32 @reduction_max(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: [[V0]] = call i32 @llvm.umax.i32(i32 [[RESULT_08]], i32 [[L0]])
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK: for.end:
-; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[V0]], [[FOR_BODY]] ], [ poison, [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]]
+; CHECK-NEXT: ret i32 [[V0]]
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
index 18d607f..f1bee3b 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
@@ -21,11 +21,9 @@ define i32 @mla_i32(ptr noalias nocapture readonly %A, ptr noalias nocapture rea
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP3]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP4]], [[TMP5]]
@@ -103,11 +101,9 @@ define i32 @mla_i8(ptr noalias nocapture readonly %A, ptr noalias nocapture read
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP5]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP4]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]]
@@ -185,8 +181,7 @@ define i32 @add_i32(ptr nocapture readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
; CHECK-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]]
@@ -251,8 +246,7 @@ define i32 @mul_i32(ptr nocapture readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 1), [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -317,8 +311,7 @@ define i32 @and_i32(ptr nocapture readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
; CHECK-NEXT: [[TMP3:%.*]] = and <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -383,8 +376,7 @@ define i32 @or_i32(ptr nocapture readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -449,8 +441,7 @@ define i32 @xor_i32(ptr nocapture readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -515,8 +506,7 @@ define float @fadd_f32(ptr nocapture readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison)
; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <4 x float> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP4]] = select fast <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP3]], <4 x float> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -581,8 +571,7 @@ define float @fmul_f32(ptr nocapture readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float 1.000000e+00), [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison)
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP4]] = select fast <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP3]], <4 x float> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -646,8 +635,7 @@ define i32 @smin_i32(ptr nocapture readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 2147483647), [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -714,8 +702,7 @@ define i32 @smax_i32(ptr nocapture readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -782,8 +769,7 @@ define i32 @umin_i32(ptr nocapture readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -850,8 +836,7 @@ define i32 @umax_i32(ptr nocapture readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll
index 1d898fb..0f4d40f 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll
@@ -24,16 +24,14 @@ define void @always_vectorize(ptr %p, i32 %x) {
; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
; DEFAULT: [[VECTOR_BODY]]:
-; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 0
-; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[P]], align 4
; DEFAULT-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 0
-; DEFAULT-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP5]], align 4
+; DEFAULT-NEXT: store <4 x i32> [[TMP3]], ptr [[P]], align 4
; DEFAULT-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; DEFAULT: [[MIDDLE_BLOCK]]:
-; DEFAULT-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
; DEFAULT: [[SCALAR_PH]]:
-; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; DEFAULT-NEXT: br label %[[FOR_BODY:.*]]
; DEFAULT: [[FOR_BODY]]:
; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -56,16 +54,14 @@ define void @always_vectorize(ptr %p, i32 %x) {
; OPTSIZE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; OPTSIZE-NEXT: br label %[[VECTOR_BODY:.*]]
; OPTSIZE: [[VECTOR_BODY]]:
-; OPTSIZE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 0
-; OPTSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; OPTSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[P]], align 4
; OPTSIZE-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; OPTSIZE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 0
-; OPTSIZE-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP5]], align 4
+; OPTSIZE-NEXT: store <4 x i32> [[TMP3]], ptr [[P]], align 4
; OPTSIZE-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; OPTSIZE: [[MIDDLE_BLOCK]]:
-; OPTSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
; OPTSIZE: [[SCALAR_PH]]:
-; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]]
; OPTSIZE: [[FOR_BODY]]:
; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -88,16 +84,14 @@ define void @always_vectorize(ptr %p, i32 %x) {
; MINSIZE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; MINSIZE-NEXT: br label %[[VECTOR_BODY:.*]]
; MINSIZE: [[VECTOR_BODY]]:
-; MINSIZE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 0
-; MINSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; MINSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[P]], align 4
; MINSIZE-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; MINSIZE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 0
-; MINSIZE-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
+; MINSIZE-NEXT: store <4 x i32> [[TMP2]], ptr [[P]], align 4
; MINSIZE-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; MINSIZE: [[MIDDLE_BLOCK]]:
-; MINSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; MINSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
; MINSIZE: [[SCALAR_PH]]:
-; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; MINSIZE-NEXT: br label %[[FOR_BODY:.*]]
; MINSIZE: [[FOR_BODY]]:
; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -145,10 +139,9 @@ define void @vectorize_without_optsize(ptr %p, i32 %x, i64 %n) {
; DEFAULT: [[VECTOR_BODY]]:
; DEFAULT-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[TMP0]]
-; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; DEFAULT-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; DEFAULT-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; DEFAULT-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]]
-; DEFAULT-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP2]], align 4
+; DEFAULT-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP1]], align 4
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4
; DEFAULT-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -235,9 +228,9 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n)
; DEFAULT-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT5]], <16 x i8> poison, <16 x i32> zeroinitializer
; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
; DEFAULT: [[VECTOR_BODY]]:
-; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE36:.*]] ]
-; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <16 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE36]] ]
-; DEFAULT-NEXT: [[VEC_IND1:%.*]] = phi <16 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[PRED_STORE_CONTINUE36]] ]
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE35:.*]] ]
+; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <16 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE35]] ]
+; DEFAULT-NEXT: [[VEC_IND1:%.*]] = phi <16 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[PRED_STORE_CONTINUE35]] ]
; DEFAULT-NEXT: [[TMP0:%.*]] = icmp ule <16 x i8> [[VEC_IND]], splat (i8 14)
; DEFAULT-NEXT: [[TMP1:%.*]] = mul <16 x i8> [[BROADCAST_SPLAT]], [[VEC_IND1]]
; DEFAULT-NEXT: [[TMP2:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 1)
@@ -256,140 +249,140 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n)
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE]]
; DEFAULT: [[PRED_STORE_CONTINUE]]:
; DEFAULT-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP0]], i32 1
-; DEFAULT-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
-; DEFAULT: [[PRED_STORE_IF7]]:
+; DEFAULT-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]]
+; DEFAULT: [[PRED_STORE_IF6]]:
; DEFAULT-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 1
; DEFAULT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP13]]
; DEFAULT-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[TMP7]], i32 1
; DEFAULT-NEXT: store i8 [[TMP15]], ptr [[TMP14]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE8]]
-; DEFAULT: [[PRED_STORE_CONTINUE8]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE7]]
+; DEFAULT: [[PRED_STORE_CONTINUE7]]:
; DEFAULT-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP0]], i32 2
-; DEFAULT-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
-; DEFAULT: [[PRED_STORE_IF9]]:
+; DEFAULT-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]]
+; DEFAULT: [[PRED_STORE_IF8]]:
; DEFAULT-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 2
; DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP17]]
; DEFAULT-NEXT: [[TMP19:%.*]] = extractelement <16 x i8> [[TMP7]], i32 2
; DEFAULT-NEXT: store i8 [[TMP19]], ptr [[TMP18]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE10]]
-; DEFAULT: [[PRED_STORE_CONTINUE10]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE9]]
+; DEFAULT: [[PRED_STORE_CONTINUE9]]:
; DEFAULT-NEXT: [[TMP20:%.*]] = extractelement <16 x i1> [[TMP0]], i32 3
-; DEFAULT-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
-; DEFAULT: [[PRED_STORE_IF11]]:
+; DEFAULT-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11:.*]]
+; DEFAULT: [[PRED_STORE_IF10]]:
; DEFAULT-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 3
; DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP21]]
; DEFAULT-NEXT: [[TMP23:%.*]] = extractelement <16 x i8> [[TMP7]], i32 3
; DEFAULT-NEXT: store i8 [[TMP23]], ptr [[TMP22]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE12]]
-; DEFAULT: [[PRED_STORE_CONTINUE12]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE11]]
+; DEFAULT: [[PRED_STORE_CONTINUE11]]:
; DEFAULT-NEXT: [[TMP24:%.*]] = extractelement <16 x i1> [[TMP0]], i32 4
-; DEFAULT-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
-; DEFAULT: [[PRED_STORE_IF13]]:
+; DEFAULT-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13:.*]]
+; DEFAULT: [[PRED_STORE_IF12]]:
; DEFAULT-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 4
; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP25]]
; DEFAULT-NEXT: [[TMP27:%.*]] = extractelement <16 x i8> [[TMP7]], i32 4
; DEFAULT-NEXT: store i8 [[TMP27]], ptr [[TMP26]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE14]]
-; DEFAULT: [[PRED_STORE_CONTINUE14]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE13]]
+; DEFAULT: [[PRED_STORE_CONTINUE13]]:
; DEFAULT-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP0]], i32 5
-; DEFAULT-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
-; DEFAULT: [[PRED_STORE_IF15]]:
+; DEFAULT-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15:.*]]
+; DEFAULT: [[PRED_STORE_IF14]]:
; DEFAULT-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 5
; DEFAULT-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP29]]
; DEFAULT-NEXT: [[TMP31:%.*]] = extractelement <16 x i8> [[TMP7]], i32 5
; DEFAULT-NEXT: store i8 [[TMP31]], ptr [[TMP30]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE16]]
-; DEFAULT: [[PRED_STORE_CONTINUE16]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE15]]
+; DEFAULT: [[PRED_STORE_CONTINUE15]]:
; DEFAULT-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP0]], i32 6
-; DEFAULT-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
-; DEFAULT: [[PRED_STORE_IF17]]:
+; DEFAULT-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF16:.*]], label %[[PRED_STORE_CONTINUE17:.*]]
+; DEFAULT: [[PRED_STORE_IF16]]:
; DEFAULT-NEXT: [[TMP33:%.*]] = add i64 [[INDEX]], 6
; DEFAULT-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP33]]
; DEFAULT-NEXT: [[TMP35:%.*]] = extractelement <16 x i8> [[TMP7]], i32 6
; DEFAULT-NEXT: store i8 [[TMP35]], ptr [[TMP34]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE18]]
-; DEFAULT: [[PRED_STORE_CONTINUE18]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE17]]
+; DEFAULT: [[PRED_STORE_CONTINUE17]]:
; DEFAULT-NEXT: [[TMP36:%.*]] = extractelement <16 x i1> [[TMP0]], i32 7
-; DEFAULT-NEXT: br i1 [[TMP36]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
-; DEFAULT: [[PRED_STORE_IF19]]:
+; DEFAULT-NEXT: br i1 [[TMP36]], label %[[PRED_STORE_IF18:.*]], label %[[PRED_STORE_CONTINUE19:.*]]
+; DEFAULT: [[PRED_STORE_IF18]]:
; DEFAULT-NEXT: [[TMP37:%.*]] = add i64 [[INDEX]], 7
; DEFAULT-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP37]]
; DEFAULT-NEXT: [[TMP39:%.*]] = extractelement <16 x i8> [[TMP7]], i32 7
; DEFAULT-NEXT: store i8 [[TMP39]], ptr [[TMP38]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE20]]
-; DEFAULT: [[PRED_STORE_CONTINUE20]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE19]]
+; DEFAULT: [[PRED_STORE_CONTINUE19]]:
; DEFAULT-NEXT: [[TMP40:%.*]] = extractelement <16 x i1> [[TMP0]], i32 8
-; DEFAULT-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
-; DEFAULT: [[PRED_STORE_IF21]]:
+; DEFAULT-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF20:.*]], label %[[PRED_STORE_CONTINUE21:.*]]
+; DEFAULT: [[PRED_STORE_IF20]]:
; DEFAULT-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], 8
; DEFAULT-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP41]]
; DEFAULT-NEXT: [[TMP43:%.*]] = extractelement <16 x i8> [[TMP7]], i32 8
; DEFAULT-NEXT: store i8 [[TMP43]], ptr [[TMP42]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE22]]
-; DEFAULT: [[PRED_STORE_CONTINUE22]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE21]]
+; DEFAULT: [[PRED_STORE_CONTINUE21]]:
; DEFAULT-NEXT: [[TMP44:%.*]] = extractelement <16 x i1> [[TMP0]], i32 9
-; DEFAULT-NEXT: br i1 [[TMP44]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
-; DEFAULT: [[PRED_STORE_IF23]]:
+; DEFAULT-NEXT: br i1 [[TMP44]], label %[[PRED_STORE_IF22:.*]], label %[[PRED_STORE_CONTINUE23:.*]]
+; DEFAULT: [[PRED_STORE_IF22]]:
; DEFAULT-NEXT: [[TMP45:%.*]] = add i64 [[INDEX]], 9
; DEFAULT-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP45]]
; DEFAULT-NEXT: [[TMP47:%.*]] = extractelement <16 x i8> [[TMP7]], i32 9
; DEFAULT-NEXT: store i8 [[TMP47]], ptr [[TMP46]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE24]]
-; DEFAULT: [[PRED_STORE_CONTINUE24]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE23]]
+; DEFAULT: [[PRED_STORE_CONTINUE23]]:
; DEFAULT-NEXT: [[TMP48:%.*]] = extractelement <16 x i1> [[TMP0]], i32 10
-; DEFAULT-NEXT: br i1 [[TMP48]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
-; DEFAULT: [[PRED_STORE_IF25]]:
+; DEFAULT-NEXT: br i1 [[TMP48]], label %[[PRED_STORE_IF24:.*]], label %[[PRED_STORE_CONTINUE25:.*]]
+; DEFAULT: [[PRED_STORE_IF24]]:
; DEFAULT-NEXT: [[TMP49:%.*]] = add i64 [[INDEX]], 10
; DEFAULT-NEXT: [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP49]]
; DEFAULT-NEXT: [[TMP51:%.*]] = extractelement <16 x i8> [[TMP7]], i32 10
; DEFAULT-NEXT: store i8 [[TMP51]], ptr [[TMP50]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE26]]
-; DEFAULT: [[PRED_STORE_CONTINUE26]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE25]]
+; DEFAULT: [[PRED_STORE_CONTINUE25]]:
; DEFAULT-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP0]], i32 11
-; DEFAULT-NEXT: br i1 [[TMP52]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
-; DEFAULT: [[PRED_STORE_IF27]]:
+; DEFAULT-NEXT: br i1 [[TMP52]], label %[[PRED_STORE_IF26:.*]], label %[[PRED_STORE_CONTINUE27:.*]]
+; DEFAULT: [[PRED_STORE_IF26]]:
; DEFAULT-NEXT: [[TMP53:%.*]] = add i64 [[INDEX]], 11
; DEFAULT-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP53]]
; DEFAULT-NEXT: [[TMP55:%.*]] = extractelement <16 x i8> [[TMP7]], i32 11
; DEFAULT-NEXT: store i8 [[TMP55]], ptr [[TMP54]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE28]]
-; DEFAULT: [[PRED_STORE_CONTINUE28]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE27]]
+; DEFAULT: [[PRED_STORE_CONTINUE27]]:
; DEFAULT-NEXT: [[TMP56:%.*]] = extractelement <16 x i1> [[TMP0]], i32 12
-; DEFAULT-NEXT: br i1 [[TMP56]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
-; DEFAULT: [[PRED_STORE_IF29]]:
+; DEFAULT-NEXT: br i1 [[TMP56]], label %[[PRED_STORE_IF28:.*]], label %[[PRED_STORE_CONTINUE29:.*]]
+; DEFAULT: [[PRED_STORE_IF28]]:
; DEFAULT-NEXT: [[TMP57:%.*]] = add i64 [[INDEX]], 12
; DEFAULT-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP57]]
; DEFAULT-NEXT: [[TMP59:%.*]] = extractelement <16 x i8> [[TMP7]], i32 12
; DEFAULT-NEXT: store i8 [[TMP59]], ptr [[TMP58]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE30]]
-; DEFAULT: [[PRED_STORE_CONTINUE30]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE29]]
+; DEFAULT: [[PRED_STORE_CONTINUE29]]:
; DEFAULT-NEXT: [[TMP60:%.*]] = extractelement <16 x i1> [[TMP0]], i32 13
-; DEFAULT-NEXT: br i1 [[TMP60]], label %[[PRED_STORE_IF31:.*]], label %[[PRED_STORE_CONTINUE32:.*]]
-; DEFAULT: [[PRED_STORE_IF31]]:
+; DEFAULT-NEXT: br i1 [[TMP60]], label %[[PRED_STORE_IF30:.*]], label %[[PRED_STORE_CONTINUE31:.*]]
+; DEFAULT: [[PRED_STORE_IF30]]:
; DEFAULT-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], 13
; DEFAULT-NEXT: [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP61]]
; DEFAULT-NEXT: [[TMP63:%.*]] = extractelement <16 x i8> [[TMP7]], i32 13
; DEFAULT-NEXT: store i8 [[TMP63]], ptr [[TMP62]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE32]]
-; DEFAULT: [[PRED_STORE_CONTINUE32]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE31]]
+; DEFAULT: [[PRED_STORE_CONTINUE31]]:
; DEFAULT-NEXT: [[TMP64:%.*]] = extractelement <16 x i1> [[TMP0]], i32 14
-; DEFAULT-NEXT: br i1 [[TMP64]], label %[[PRED_STORE_IF33:.*]], label %[[PRED_STORE_CONTINUE34:.*]]
-; DEFAULT: [[PRED_STORE_IF33]]:
+; DEFAULT-NEXT: br i1 [[TMP64]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33:.*]]
+; DEFAULT: [[PRED_STORE_IF32]]:
; DEFAULT-NEXT: [[TMP65:%.*]] = add i64 [[INDEX]], 14
; DEFAULT-NEXT: [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP65]]
; DEFAULT-NEXT: [[TMP67:%.*]] = extractelement <16 x i8> [[TMP7]], i32 14
; DEFAULT-NEXT: store i8 [[TMP67]], ptr [[TMP66]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE34]]
-; DEFAULT: [[PRED_STORE_CONTINUE34]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE33]]
+; DEFAULT: [[PRED_STORE_CONTINUE33]]:
; DEFAULT-NEXT: [[TMP68:%.*]] = extractelement <16 x i1> [[TMP0]], i32 15
-; DEFAULT-NEXT: br i1 [[TMP68]], label %[[PRED_STORE_IF35:.*]], label %[[PRED_STORE_CONTINUE36]]
-; DEFAULT: [[PRED_STORE_IF35]]:
+; DEFAULT-NEXT: br i1 [[TMP68]], label %[[PRED_STORE_IF34:.*]], label %[[PRED_STORE_CONTINUE35]]
+; DEFAULT: [[PRED_STORE_IF34]]:
; DEFAULT-NEXT: [[TMP69:%.*]] = add i64 [[INDEX]], 15
; DEFAULT-NEXT: [[TMP70:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP69]]
; DEFAULT-NEXT: [[TMP71:%.*]] = extractelement <16 x i8> [[TMP7]], i32 15
; DEFAULT-NEXT: store i8 [[TMP71]], ptr [[TMP70]], align 1
-; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE36]]
-; DEFAULT: [[PRED_STORE_CONTINUE36]]:
+; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE35]]
+; DEFAULT: [[PRED_STORE_CONTINUE35]]:
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 16)
; DEFAULT-NEXT: [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16)
@@ -499,25 +492,22 @@ define void @dont_vectorize_with_minsize() {
; DEFAULT: [[VECTOR_BODY]]:
; DEFAULT-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[TMP0]]
-; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0
-; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[TMP0]]
-; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP4]], i32 0
-; DEFAULT-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
+; DEFAULT-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
; DEFAULT-NEXT: [[TMP6:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[TMP0]]
-; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP9]], i32 0
-; DEFAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i16>, ptr [[TMP10]], align 2
+; DEFAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i16>, ptr [[TMP9]], align 2
; DEFAULT-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP6]] to <4 x i16>
; DEFAULT-NEXT: [[TMP11:%.*]] = add <4 x i16> [[TMP8]], [[WIDE_LOAD2]]
-; DEFAULT-NEXT: store <4 x i16> [[TMP11]], ptr [[TMP10]], align 2
+; DEFAULT-NEXT: store <4 x i16> [[TMP11]], ptr [[TMP9]], align 2
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4
; DEFAULT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
-; DEFAULT-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
; DEFAULT: [[SCALAR_PH]]:
-; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; DEFAULT-NEXT: br label %[[FOR_BODY:.*]]
; DEFAULT: [[FOR_BODY]]:
; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -546,25 +536,22 @@ define void @dont_vectorize_with_minsize() {
; OPTSIZE: [[VECTOR_BODY]]:
; OPTSIZE-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; OPTSIZE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[TMP0]]
-; OPTSIZE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0
-; OPTSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; OPTSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; OPTSIZE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[TMP0]]
-; OPTSIZE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP3]], i32 0
-; OPTSIZE-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; OPTSIZE-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
; OPTSIZE-NEXT: [[TMP5:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; OPTSIZE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[TMP0]]
-; OPTSIZE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP6]], i32 0
-; OPTSIZE-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i16>, ptr [[TMP7]], align 2
+; OPTSIZE-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i16>, ptr [[TMP6]], align 2
; OPTSIZE-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP5]] to <4 x i16>
; OPTSIZE-NEXT: [[TMP9:%.*]] = add <4 x i16> [[TMP8]], [[WIDE_LOAD2]]
-; OPTSIZE-NEXT: store <4 x i16> [[TMP9]], ptr [[TMP7]], align 2
+; OPTSIZE-NEXT: store <4 x i16> [[TMP9]], ptr [[TMP6]], align 2
; OPTSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4
; OPTSIZE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; OPTSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; OPTSIZE: [[MIDDLE_BLOCK]]:
-; OPTSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
; OPTSIZE: [[SCALAR_PH]]:
-; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]]
; OPTSIZE: [[FOR_BODY]]:
; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -593,25 +580,22 @@ define void @dont_vectorize_with_minsize() {
; MINSIZE: [[VECTOR_BODY]]:
; MINSIZE-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; MINSIZE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[TMP0]]
-; MINSIZE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0
-; MINSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; MINSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; MINSIZE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[TMP0]]
-; MINSIZE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP3]], i32 0
-; MINSIZE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
+; MINSIZE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
; MINSIZE-NEXT: [[TMP5:%.*]] = mul nsw <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; MINSIZE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[TMP0]]
-; MINSIZE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP6]], i32 0
-; MINSIZE-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i16>, ptr [[TMP7]], align 2
+; MINSIZE-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i16>, ptr [[TMP6]], align 2
; MINSIZE-NEXT: [[TMP8:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16>
; MINSIZE-NEXT: [[TMP9:%.*]] = add <2 x i16> [[TMP8]], [[WIDE_LOAD2]]
-; MINSIZE-NEXT: store <2 x i16> [[TMP9]], ptr [[TMP7]], align 2
+; MINSIZE-NEXT: store <2 x i16> [[TMP9]], ptr [[TMP6]], align 2
; MINSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
; MINSIZE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; MINSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; MINSIZE: [[MIDDLE_BLOCK]]:
-; MINSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; MINSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
; MINSIZE: [[SCALAR_PH]]:
-; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; MINSIZE-NEXT: br label %[[FOR_BODY:.*]]
; MINSIZE: [[FOR_BODY]]:
; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -668,25 +652,22 @@ define void @vectorization_forced() {
; DEFAULT: [[VECTOR_BODY]]:
; DEFAULT-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[TMP0]]
-; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0
-; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[TMP0]]
-; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP4]], i32 0
-; DEFAULT-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
+; DEFAULT-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
; DEFAULT-NEXT: [[TMP6:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[TMP0]]
-; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP9]], i32 0
-; DEFAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i16>, ptr [[TMP10]], align 2
+; DEFAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i16>, ptr [[TMP9]], align 2
; DEFAULT-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP6]] to <4 x i16>
; DEFAULT-NEXT: [[TMP11:%.*]] = add <4 x i16> [[TMP8]], [[WIDE_LOAD2]]
-; DEFAULT-NEXT: store <4 x i16> [[TMP11]], ptr [[TMP10]], align 2
+; DEFAULT-NEXT: store <4 x i16> [[TMP11]], ptr [[TMP9]], align 2
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4
; DEFAULT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
-; DEFAULT-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
; DEFAULT: [[SCALAR_PH]]:
-; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; DEFAULT-NEXT: br label %[[FOR_BODY:.*]]
; DEFAULT: [[FOR_BODY]]:
; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -715,25 +696,22 @@ define void @vectorization_forced() {
; OPTSIZE: [[VECTOR_BODY]]:
; OPTSIZE-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; OPTSIZE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[TMP0]]
-; OPTSIZE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0
-; OPTSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; OPTSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; OPTSIZE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[TMP0]]
-; OPTSIZE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP3]], i32 0
-; OPTSIZE-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; OPTSIZE-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
; OPTSIZE-NEXT: [[TMP5:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; OPTSIZE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[TMP0]]
-; OPTSIZE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP6]], i32 0
-; OPTSIZE-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i16>, ptr [[TMP7]], align 2
+; OPTSIZE-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i16>, ptr [[TMP6]], align 2
; OPTSIZE-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP5]] to <4 x i16>
; OPTSIZE-NEXT: [[TMP9:%.*]] = add <4 x i16> [[TMP8]], [[WIDE_LOAD2]]
-; OPTSIZE-NEXT: store <4 x i16> [[TMP9]], ptr [[TMP7]], align 2
+; OPTSIZE-NEXT: store <4 x i16> [[TMP9]], ptr [[TMP6]], align 2
; OPTSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4
; OPTSIZE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; OPTSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; OPTSIZE: [[MIDDLE_BLOCK]]:
-; OPTSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
; OPTSIZE: [[SCALAR_PH]]:
-; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]]
; OPTSIZE: [[FOR_BODY]]:
; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -762,25 +740,22 @@ define void @vectorization_forced() {
; MINSIZE: [[VECTOR_BODY]]:
; MINSIZE-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; MINSIZE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[TMP0]]
-; MINSIZE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0
-; MINSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; MINSIZE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; MINSIZE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[TMP0]]
-; MINSIZE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP3]], i32 0
-; MINSIZE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
+; MINSIZE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
; MINSIZE-NEXT: [[TMP5:%.*]] = mul nsw <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; MINSIZE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[TMP0]]
-; MINSIZE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP6]], i32 0
-; MINSIZE-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i16>, ptr [[TMP7]], align 2
+; MINSIZE-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i16>, ptr [[TMP6]], align 2
; MINSIZE-NEXT: [[TMP8:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16>
; MINSIZE-NEXT: [[TMP9:%.*]] = add <2 x i16> [[TMP8]], [[WIDE_LOAD2]]
-; MINSIZE-NEXT: store <2 x i16> [[TMP9]], ptr [[TMP7]], align 2
+; MINSIZE-NEXT: store <2 x i16> [[TMP9]], ptr [[TMP6]], align 2
; MINSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
; MINSIZE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; MINSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; MINSIZE: [[MIDDLE_BLOCK]]:
-; MINSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; MINSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
; MINSIZE: [[SCALAR_PH]]:
-; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; MINSIZE-NEXT: br label %[[FOR_BODY:.*]]
; MINSIZE: [[FOR_BODY]]:
; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll
index fa03e29..7c6e705 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll
@@ -62,8 +62,8 @@ define hidden void @pointer_phi_v4i32_add2(ptr noalias nocapture readonly %A, pt
; CHECK-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: store <4 x i32> [[TMP0]], ptr [[NEXT_GEP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996
-; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996
+; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: for.body:
; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ]
@@ -106,8 +106,8 @@ define hidden void @pointer_phi_v4i32_add3(ptr noalias nocapture readonly %A, pt
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A]], [[ENTRY]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 0, i32 12, i32 24, i32 36>
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i32 [[OFFSET_IDX]]
@@ -116,8 +116,8 @@ define hidden void @pointer_phi_v4i32_add3(ptr noalias nocapture readonly %A, pt
; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[NEXT_GEP]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 48
-; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996
-; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996
+; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: for.body:
; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ]
@@ -212,8 +212,8 @@ define hidden void @pointer_phi_v8i16_add2(ptr noalias nocapture readonly %A, pt
; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i16> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: store <8 x i16> [[TMP1]], ptr [[NEXT_GEP5]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
-; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
+; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: for.body:
; CHECK-NEXT: [[A_ADDR_011:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 992, [[VECTOR_BODY]] ]
@@ -495,8 +495,8 @@ define hidden void @pointer_phi_v4f32_add2(ptr noalias nocapture readonly %A, pt
; CHECK-NEXT: [[TMP0:%.*]] = fadd fast <4 x float> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[NEXT_GEP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996
-; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996
+; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: for.body:
; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ]
@@ -539,8 +539,8 @@ define hidden void @pointer_phi_v4f32_add3(ptr noalias nocapture readonly %A, pt
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A]], [[ENTRY]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 0, i32 12, i32 24, i32 36>
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i32 [[OFFSET_IDX]]
@@ -549,8 +549,8 @@ define hidden void @pointer_phi_v4f32_add3(ptr noalias nocapture readonly %A, pt
; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[NEXT_GEP]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 48
-; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996
-; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996
+; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: for.body:
; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ]
@@ -642,8 +642,8 @@ define hidden void @pointer_phi_v4half_add2(ptr noalias nocapture readonly %A, p
; CHECK-NEXT: [[TMP0:%.*]] = fadd fast <8 x half> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: store <8 x half> [[TMP0]], ptr [[NEXT_GEP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
-; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
+; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: for.body:
; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 992, [[VECTOR_BODY]] ]
@@ -696,8 +696,8 @@ define hidden void @pointer_phi_v4half_add3(ptr noalias nocapture readonly %A, p
; CHECK-NEXT: [[TMP0:%.*]] = fadd fast <8 x half> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: store <8 x half> [[TMP0]], ptr [[NEXT_GEP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
-; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
+; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: for.body:
; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 992, [[VECTOR_BODY]] ]
@@ -743,8 +743,8 @@ define hidden void @pointer_phi_v4i32_uf2(ptr noalias nocapture readonly %A, ptr
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A]], [[ENTRY]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 0, i32 24, i32 48, i32 72>
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 96, i32 120, i32 144, i32 168>
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2
@@ -808,8 +808,8 @@ define hidden void @pointer_phi_v4i32_uf4(ptr noalias nocapture readonly %A, ptr
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A]], [[ENTRY]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 0, i32 24, i32 48, i32 72>
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 96, i32 120, i32 144, i32 168>
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 192, i32 216, i32 240, i32 264>
@@ -880,8 +880,8 @@ define hidden void @mult_ptr_iv(ptr noalias nocapture readonly %x, ptr noalias n
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ], [ [[X]], [[ENTRY]] ]
; CHECK-NEXT: [[POINTER_PHI5:%.*]] = phi ptr [ [[PTR_IND6:%.*]], [[VECTOR_BODY]] ], [ [[Z]], [[ENTRY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 0, i32 3, i32 6, i32 9>
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[POINTER_PHI5]], <4 x i32> <i32 0, i32 3, i32 6, i32 9>
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll b/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
index ce714f6..91b0508 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
@@ -45,17 +45,14 @@ define i32 @test(ptr nocapture readonly %x) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[T4]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <2 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP7:%.*]] = fpext <2 x float> [[TMP6]] to <2 x double>
; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x double> [[TMP7]], [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[T6]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x float>, ptr [[TMP10]], align 4
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x float>, ptr [[TMP9]], align 4
; CHECK-NEXT: [[TMP11:%.*]] = fpext <2 x float> [[WIDE_LOAD2]] to <2 x double>
; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <2 x double> [[TMP8]], [[TMP11]]
; CHECK-NEXT: [[TMP13]] = fsub fast <2 x double> [[VEC_PHI]], [[TMP12]]
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll
index 2c659d1..957f8ee 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll
@@ -14,25 +14,21 @@ define void @trunc_not_allowed_different_vec_elemns(ptr noalias nocapture %A, pt
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP5]] to <4 x i16>
; CHECK-NEXT: [[TMP9:%.*]] = shl <4 x i16> [[TMP8]], splat (i16 1)
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[D:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0
-; CHECK-NEXT: store <4 x i16> [[TMP9]], ptr [[TMP11]], align 2
+; CHECK-NEXT: store <4 x i16> [[TMP9]], ptr [[TMP10]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 428, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
@@ -53,7 +49,7 @@ define void @trunc_not_allowed_different_vec_elemns(ptr noalias nocapture %A, pt
; CHECK-NEXT: store i16 [[CONV7]], ptr [[ARRAYIDX8]], align 2
; CHECK-NEXT: [[ADD9]] = add nuw nsw i32 [[I_021]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[ADD9]], 431
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
;
entry:
br label %for.body
@@ -127,21 +123,18 @@ define void @narrowing_load_not_allowed(ptr noalias nocapture %A, ptr noalias no
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[C:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[TMP5:%.*]] = trunc <8 x i16> [[WIDE_LOAD]] to <8 x i8>
; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i8> [[WIDE_LOAD1]], [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP8]], align 1
+; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP7]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 424
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 424, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
@@ -159,7 +152,7 @@ define void @narrowing_load_not_allowed(ptr noalias nocapture %A, ptr noalias no
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX5]], align 1
; CHECK-NEXT: [[ADD6]] = add nuw nsw i32 [[I_012]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[ADD6]], 431
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
;
entry:
br label %for.body
@@ -196,20 +189,17 @@ define void @trunc_not_allowed(ptr noalias nocapture %A, ptr noalias nocapture r
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 428, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
@@ -227,7 +217,7 @@ define void @trunc_not_allowed(ptr noalias nocapture %A, ptr noalias nocapture r
; CHECK-NEXT: [[ADD3]] = add nuw nsw i32 [[I_09]], 1
; CHECK-NEXT: [[ADD_IV:%.*]] = trunc i32 [[ADD3]] to i16
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i16 [[ADD_IV]], 431
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
;
entry:
br label %for.body
@@ -264,8 +254,7 @@ define void @strides_different_direction(ptr noalias nocapture %A, ptr noalias n
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = sub nsw i32 [[N:%.*]], [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i32 [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
@@ -274,13 +263,12 @@ define void @strides_different_direction(ptr noalias nocapture %A, ptr noalias n
; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[REVERSE]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP9]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 428, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
@@ -298,7 +286,7 @@ define void @strides_different_direction(ptr noalias nocapture %A, ptr noalias n
; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[ADD3]] = add nuw nsw i32 [[I_09]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[ADD3]], 431
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
;
entry:
br label %for.body
@@ -330,20 +318,17 @@ define void @too_many_loop_blocks(ptr noalias nocapture %A, ptr noalias nocaptur
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 428, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
@@ -362,7 +347,7 @@ define void @too_many_loop_blocks(ptr noalias nocapture %A, ptr noalias nocaptur
; CHECK: loopincr:
; CHECK-NEXT: [[ADD3]] = add nuw nsw i32 [[I_09]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[ADD3]], 431
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
;
entry:
br label %for.body
@@ -435,25 +420,21 @@ define void @fptrunc_not_allowed(ptr noalias nocapture %A, ptr noalias nocapture
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP6]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = fptrunc <4 x float> [[TMP5]] to <4 x half>
; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <4 x half> [[TMP8]], splat (half 0xH4000)
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds half, ptr [[D:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds half, ptr [[TMP10]], i32 0
-; CHECK-NEXT: store <4 x half> [[TMP9]], ptr [[TMP11]], align 2
+; CHECK-NEXT: store <4 x half> [[TMP9]], ptr [[TMP10]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 428, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
@@ -474,7 +455,7 @@ define void @fptrunc_not_allowed(ptr noalias nocapture %A, ptr noalias nocapture
; CHECK-NEXT: store half [[FACTOR]], ptr [[ARRAYIDX5]], align 2
; CHECK-NEXT: [[ADD6]] = add nuw nsw i32 [[I_017]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[ADD6]], 431
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
;
entry:
br label %for.body
@@ -525,15 +506,13 @@ define dso_local void @select_not_allowed(ptr noalias nocapture %A, ptr noalias
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x ptr> [[BROADCAST_SPLAT]], <4 x ptr> [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, <4 x ptr> [[TMP4]], <4 x i32> [[VEC_IND]]
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP5]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison)
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <4 x i32> [[WIDE_MASKED_GATHER]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store <4 x i32> [[WIDE_MASKED_GATHER]], ptr [[TMP6]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -606,8 +585,7 @@ define i32 @i32_smin_reduction(ptr nocapture readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 2147483647), [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -674,8 +652,7 @@ define i32 @i32_smax_reduction(ptr nocapture readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -742,8 +719,7 @@ define i32 @i32_umin_reduction(ptr nocapture readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -810,8 +786,7 @@ define i32 @i32_umax_reduction(ptr nocapture readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll
index 06e345f..e08614b 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll
@@ -25,10 +25,8 @@ define void @outside_user_blocks_tail_folding(ptr nocapture readonly %ptr, i32 %
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[NEXT_GEP]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll b/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll
index 631f3c7..17eeafa 100644
--- a/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll
+++ b/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll
@@ -22,21 +22,20 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP4]], align 8
+; CHECK-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP2]], align 8
; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP5]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
index b498712..1d0751a 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
@@ -51,7 +51,6 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
; CHECK-NEXT: [[STEP_ADD_10:%.*]] = add <2 x i64> [[STEP_ADD_9]], splat (i64 2)
; CHECK-NEXT: [[STEP_ADD_11:%.*]] = add <2 x i64> [[STEP_ADD_10]], splat (i64 2)
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 2
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 4
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 6
@@ -63,7 +62,7 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 18
; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 20
; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 22
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP11]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[NEXT_GEP]], align 1
; CHECK-NEXT: [[WIDE_LOAD25:%.*]] = load <2 x i8>, ptr [[TMP12]], align 1
; CHECK-NEXT: [[WIDE_LOAD26:%.*]] = load <2 x i8>, ptr [[TMP13]], align 1
; CHECK-NEXT: [[WIDE_LOAD27:%.*]] = load <2 x i8>, ptr [[TMP14]], align 1
@@ -163,8 +162,7 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
; CHECK-NEXT: [[VEC_IND27:%.*]] = phi <2 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT28:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI29:%.*]] = phi <2 x i64> [ [[TMP57]], %[[VEC_EPILOG_PH]] ], [ [[TMP58:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP30:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX38]]
-; CHECK-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[NEXT_GEP30]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD32:%.*]] = load <2 x i8>, ptr [[TMP60]], align 1
+; CHECK-NEXT: [[WIDE_LOAD32:%.*]] = load <2 x i8>, ptr [[NEXT_GEP30]], align 1
; CHECK-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[WIDE_LOAD32]] to <2 x i64>
; CHECK-NEXT: [[TMP62:%.*]] = shl <2 x i64> [[VEC_IND27]], splat (i64 1)
; CHECK-NEXT: [[TMP63:%.*]] = shl <2 x i64> [[TMP61]], [[TMP62]]
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
index ed83af6..4d14624 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
@@ -26,7 +26,6 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32
; VF-TWO-CHECK: vector.body:
; VF-TWO-CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VF-TWO-CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[INDEX]]
-; VF-TWO-CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
; VF-TWO-CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 4
; VF-TWO-CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 8
; VF-TWO-CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 12
@@ -34,7 +33,7 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32
; VF-TWO-CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 20
; VF-TWO-CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 24
; VF-TWO-CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 28
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP16]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP8]], align 4
; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP17]], align 4
; VF-TWO-CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP18]], align 4
; VF-TWO-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP19]], align 4
@@ -43,7 +42,6 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32
; VF-TWO-CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP22]], align 4
; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP23]], align 4
; VF-TWO-CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[INDEX]]
-; VF-TWO-CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 0
; VF-TWO-CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 4
; VF-TWO-CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 8
; VF-TWO-CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 12
@@ -51,7 +49,7 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32
; VF-TWO-CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 20
; VF-TWO-CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 24
; VF-TWO-CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 28
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP32]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP24]], align 4
; VF-TWO-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP33]], align 4
; VF-TWO-CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP34]], align 4
; VF-TWO-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP35]], align 4
@@ -68,7 +66,6 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32
; VF-TWO-CHECK-NEXT: [[TMP46:%.*]] = fadd fast <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD15]]
; VF-TWO-CHECK-NEXT: [[TMP47:%.*]] = fadd fast <4 x float> [[WIDE_LOAD8]], [[WIDE_LOAD16]]
; VF-TWO-CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[INDEX]]
-; VF-TWO-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 0
; VF-TWO-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 4
; VF-TWO-CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 8
; VF-TWO-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 12
@@ -76,7 +73,7 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32
; VF-TWO-CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 20
; VF-TWO-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 24
; VF-TWO-CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 28
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP40]], ptr [[TMP56]], align 4
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP40]], ptr [[TMP48]], align 4
; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP41]], ptr [[TMP57]], align 4
; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP42]], ptr [[TMP58]], align 4
; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP43]], ptr [[TMP59]], align 4
@@ -102,15 +99,12 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32
; VF-TWO-CHECK: vec.epilog.vector.body:
; VF-TWO-CHECK-NEXT: [[INDEX20:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; VF-TWO-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[INDEX20]]
-; VF-TWO-CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds float, ptr [[TMP66]], i32 0
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD21:%.*]] = load <2 x float>, ptr [[TMP67]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD21:%.*]] = load <2 x float>, ptr [[TMP66]], align 4
; VF-TWO-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[INDEX20]]
-; VF-TWO-CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, ptr [[TMP68]], i32 0
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <2 x float>, ptr [[TMP69]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <2 x float>, ptr [[TMP68]], align 4
; VF-TWO-CHECK-NEXT: [[TMP70:%.*]] = fadd fast <2 x float> [[WIDE_LOAD21]], [[WIDE_LOAD22]]
; VF-TWO-CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[INDEX20]]
-; VF-TWO-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, ptr [[TMP71]], i32 0
-; VF-TWO-CHECK-NEXT: store <2 x float> [[TMP70]], ptr [[TMP72]], align 4
+; VF-TWO-CHECK-NEXT: store <2 x float> [[TMP70]], ptr [[TMP71]], align 4
; VF-TWO-CHECK-NEXT: [[INDEX_NEXT23]] = add nuw i64 [[INDEX20]], 2
; VF-TWO-CHECK-NEXT: [[TMP73:%.*]] = icmp eq i64 [[INDEX_NEXT23]], [[N_VEC18]]
; VF-TWO-CHECK-NEXT: br i1 [[TMP73]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -156,7 +150,6 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32
; VF-FOUR-CHECK: vector.body:
; VF-FOUR-CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VF-FOUR-CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[INDEX]]
-; VF-FOUR-CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
; VF-FOUR-CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 4
; VF-FOUR-CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 8
; VF-FOUR-CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 12
@@ -164,7 +157,7 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32
; VF-FOUR-CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 20
; VF-FOUR-CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 24
; VF-FOUR-CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 28
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP16]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP8]], align 4
; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP17]], align 4
; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP18]], align 4
; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP19]], align 4
@@ -173,7 +166,6 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32
; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP22]], align 4
; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP23]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[INDEX]]
-; VF-FOUR-CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 0
; VF-FOUR-CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 4
; VF-FOUR-CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 8
; VF-FOUR-CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 12
@@ -181,7 +173,7 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32
; VF-FOUR-CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 20
; VF-FOUR-CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 24
; VF-FOUR-CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[TMP24]], i32 28
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP32]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP24]], align 4
; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP33]], align 4
; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP34]], align 4
; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP35]], align 4
@@ -198,7 +190,6 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32
; VF-FOUR-CHECK-NEXT: [[TMP46:%.*]] = fadd fast <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD15]]
; VF-FOUR-CHECK-NEXT: [[TMP47:%.*]] = fadd fast <4 x float> [[WIDE_LOAD8]], [[WIDE_LOAD16]]
; VF-FOUR-CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[INDEX]]
-; VF-FOUR-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 0
; VF-FOUR-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 4
; VF-FOUR-CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 8
; VF-FOUR-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 12
@@ -206,7 +197,7 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32
; VF-FOUR-CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 20
; VF-FOUR-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 24
; VF-FOUR-CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 28
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP40]], ptr [[TMP56]], align 4
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP40]], ptr [[TMP48]], align 4
; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP41]], ptr [[TMP57]], align 4
; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP42]], ptr [[TMP58]], align 4
; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP43]], ptr [[TMP59]], align 4
@@ -232,15 +223,12 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32
; VF-FOUR-CHECK: vec.epilog.vector.body:
; VF-FOUR-CHECK-NEXT: [[INDEX20:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; VF-FOUR-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[INDEX20]]
-; VF-FOUR-CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds float, ptr [[TMP66]], i32 0
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD21:%.*]] = load <4 x float>, ptr [[TMP67]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD21:%.*]] = load <4 x float>, ptr [[TMP66]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[INDEX20]]
-; VF-FOUR-CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, ptr [[TMP68]], i32 0
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x float>, ptr [[TMP69]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x float>, ptr [[TMP68]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP70:%.*]] = fadd fast <4 x float> [[WIDE_LOAD21]], [[WIDE_LOAD22]]
; VF-FOUR-CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[INDEX20]]
-; VF-FOUR-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, ptr [[TMP71]], i32 0
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP70]], ptr [[TMP72]], align 4
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP70]], ptr [[TMP71]], align 4
; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT23]] = add nuw i64 [[INDEX20]], 4
; VF-FOUR-CHECK-NEXT: [[TMP73:%.*]] = icmp eq i64 [[INDEX_NEXT23]], [[N_VEC18]]
; VF-FOUR-CHECK-NEXT: br i1 [[TMP73]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -375,7 +363,6 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n)
; VF-TWO-CHECK-NEXT: [[TMP78:%.*]] = fadd fast <4 x float> [[REVERSE13]], splat (float 1.000000e+00)
; VF-TWO-CHECK-NEXT: [[TMP79:%.*]] = fadd fast <4 x float> [[REVERSE15]], splat (float 1.000000e+00)
; VF-TWO-CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; VF-TWO-CHECK-NEXT: [[TMP88:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 0
; VF-TWO-CHECK-NEXT: [[TMP89:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 4
; VF-TWO-CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 8
; VF-TWO-CHECK-NEXT: [[TMP91:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 12
@@ -383,7 +370,7 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n)
; VF-TWO-CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 20
; VF-TWO-CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 24
; VF-TWO-CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 28
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP72]], ptr [[TMP88]], align 4
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP72]], ptr [[TMP80]], align 4
; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP73]], ptr [[TMP89]], align 4
; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP74]], ptr [[TMP90]], align 4
; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP75]], ptr [[TMP91]], align 4
@@ -421,8 +408,7 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n)
; VF-TWO-CHECK-NEXT: [[REVERSE24:%.*]] = shufflevector <2 x float> [[WIDE_LOAD23]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
; VF-TWO-CHECK-NEXT: [[TMP105:%.*]] = fadd fast <2 x float> [[REVERSE24]], splat (float 1.000000e+00)
; VF-TWO-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX21]]
-; VF-TWO-CHECK-NEXT: [[TMP107:%.*]] = getelementptr inbounds float, ptr [[TMP106]], i32 0
-; VF-TWO-CHECK-NEXT: store <2 x float> [[TMP105]], ptr [[TMP107]], align 4
+; VF-TWO-CHECK-NEXT: store <2 x float> [[TMP105]], ptr [[TMP106]], align 4
; VF-TWO-CHECK-NEXT: [[INDEX_NEXT25]] = add nuw i64 [[INDEX21]], 2
; VF-TWO-CHECK-NEXT: [[TMP108:%.*]] = icmp eq i64 [[INDEX_NEXT25]], [[N_VEC17]]
; VF-TWO-CHECK-NEXT: br i1 [[TMP108]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -530,7 +516,6 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n)
; VF-FOUR-CHECK-NEXT: [[TMP78:%.*]] = fadd fast <4 x float> [[REVERSE13]], splat (float 1.000000e+00)
; VF-FOUR-CHECK-NEXT: [[TMP79:%.*]] = fadd fast <4 x float> [[REVERSE15]], splat (float 1.000000e+00)
; VF-FOUR-CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; VF-FOUR-CHECK-NEXT: [[TMP88:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 0
; VF-FOUR-CHECK-NEXT: [[TMP89:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 4
; VF-FOUR-CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 8
; VF-FOUR-CHECK-NEXT: [[TMP91:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 12
@@ -538,7 +523,7 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n)
; VF-FOUR-CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 20
; VF-FOUR-CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 24
; VF-FOUR-CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, ptr [[TMP80]], i32 28
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP72]], ptr [[TMP88]], align 4
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP72]], ptr [[TMP80]], align 4
; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP73]], ptr [[TMP89]], align 4
; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP74]], ptr [[TMP90]], align 4
; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP75]], ptr [[TMP91]], align 4
@@ -576,8 +561,7 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n)
; VF-FOUR-CHECK-NEXT: [[REVERSE24:%.*]] = shufflevector <4 x float> [[WIDE_LOAD23]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF-FOUR-CHECK-NEXT: [[TMP105:%.*]] = fadd fast <4 x float> [[REVERSE24]], splat (float 1.000000e+00)
; VF-FOUR-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX21]]
-; VF-FOUR-CHECK-NEXT: [[TMP107:%.*]] = getelementptr inbounds float, ptr [[TMP106]], i32 0
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP105]], ptr [[TMP107]], align 4
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP105]], ptr [[TMP106]], align 4
; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT25]] = add nuw i64 [[INDEX21]], 4
; VF-FOUR-CHECK-NEXT: [[TMP108:%.*]] = icmp eq i64 [[INDEX_NEXT25]], [[N_VEC17]]
; VF-FOUR-CHECK-NEXT: br i1 [[TMP108]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll
index 1e64a20..4dd5403 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll
@@ -34,7 +34,6 @@ define void @test(ptr %arr, i32 %len) {
; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[ARR]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 2
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 4
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 6
@@ -42,7 +41,7 @@ define void @test(ptr %arr, i32 %len) {
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 10
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 12
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 14
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x double>, ptr [[TMP6]], align 8
; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
@@ -87,8 +86,7 @@ define void @test(ptr %arr, i32 %len) {
; CHECK-NEXT: [[TMP23:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT27:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI25:%.*]] = phi <2 x double> [ [[TMP22]], %[[VEC_EPILOG_PH]] ], [ [[TMP26:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, ptr [[ARR]], i64 [[TMP23]]
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, ptr [[TMP24]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD26:%.*]] = load <2 x double>, ptr [[TMP25]], align 8
+; CHECK-NEXT: [[WIDE_LOAD26:%.*]] = load <2 x double>, ptr [[TMP24]], align 8
; CHECK-NEXT: [[TMP26]] = fadd fast <2 x double> [[WIDE_LOAD26]], [[VEC_PHI25]]
; CHECK-NEXT: [[INDEX_NEXT27]] = add nuw i64 [[TMP23]], 2
; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT27]], [[N_VEC23]]
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll
index a515b10..2c85b75 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll
@@ -17,13 +17,11 @@ define dso_local void @test(ptr %Arr, i32 signext %Len) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[ARR:%.*]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP6]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll
index 2b4d8b9..21266e5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S | FileCheck %s -check-prefix=NO-ZVFBFMIN
+; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue | FileCheck %s -check-prefix=NO-ZVFBFMIN-PREDICATED
; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfbfmin -S | FileCheck %s -check-prefix=ZVFBFMIN
define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
@@ -21,6 +22,24 @@ define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
; NO-ZVFBFMIN: [[EXIT]]:
; NO-ZVFBFMIN-NEXT: ret void
;
+; NO-ZVFBFMIN-PREDICATED-LABEL: define void @fadd(
+; NO-ZVFBFMIN-PREDICATED-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[ENTRY:.*]]:
+; NO-ZVFBFMIN-PREDICATED-NEXT: br label %[[LOOP:.*]]
+; NO-ZVFBFMIN-PREDICATED: [[LOOP]]:
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[X:%.*]] = load bfloat, ptr [[A_GEP]], align 2
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[Y:%.*]] = load bfloat, ptr [[B_GEP]], align 2
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[Z:%.*]] = fadd bfloat [[X]], [[Y]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: store bfloat [[Z]], ptr [[A_GEP]], align 2
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[I_NEXT]] = add i64 [[I]], 1
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-ZVFBFMIN-PREDICATED: [[EXIT]]:
+; NO-ZVFBFMIN-PREDICATED-NEXT: ret void
+;
; ZVFBFMIN-LABEL: define void @fadd(
; ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; ZVFBFMIN-NEXT: [[ENTRY:.*]]:
@@ -40,12 +59,10 @@ define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
; ZVFBFMIN-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; ZVFBFMIN-NEXT: [[TMP1:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[TMP0]]
; ZVFBFMIN-NEXT: [[TMP2:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[TMP0]]
-; ZVFBFMIN-NEXT: [[TMP3:%.*]] = getelementptr bfloat, ptr [[TMP1]], i32 0
-; ZVFBFMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x bfloat>, ptr [[TMP3]], align 2
-; ZVFBFMIN-NEXT: [[TMP4:%.*]] = getelementptr bfloat, ptr [[TMP2]], i32 0
-; ZVFBFMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x bfloat>, ptr [[TMP4]], align 2
+; ZVFBFMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x bfloat>, ptr [[TMP1]], align 2
+; ZVFBFMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x bfloat>, ptr [[TMP2]], align 2
; ZVFBFMIN-NEXT: [[TMP11:%.*]] = fadd <vscale x 8 x bfloat> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; ZVFBFMIN-NEXT: store <vscale x 8 x bfloat> [[TMP11]], ptr [[TMP3]], align 2
+; ZVFBFMIN-NEXT: store <vscale x 8 x bfloat> [[TMP11]], ptr [[TMP1]], align 2
; ZVFBFMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], [[TMP5]]
; ZVFBFMIN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; ZVFBFMIN-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -101,16 +118,13 @@ define void @vfwmaccbf16.vv(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64
; NO-ZVFBFMIN-NEXT: [[TMP1:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[TMP0]]
; NO-ZVFBFMIN-NEXT: [[TMP2:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[TMP0]]
; NO-ZVFBFMIN-NEXT: [[TMP3:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP0]]
-; NO-ZVFBFMIN-NEXT: [[TMP4:%.*]] = getelementptr bfloat, ptr [[TMP1]], i32 0
-; NO-ZVFBFMIN-NEXT: [[WIDE_LOAD:%.*]] = load <4 x bfloat>, ptr [[TMP4]], align 2
-; NO-ZVFBFMIN-NEXT: [[TMP5:%.*]] = getelementptr bfloat, ptr [[TMP2]], i32 0
-; NO-ZVFBFMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x bfloat>, ptr [[TMP5]], align 2
-; NO-ZVFBFMIN-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[TMP3]], i32 0
-; NO-ZVFBFMIN-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
+; NO-ZVFBFMIN-NEXT: [[WIDE_LOAD:%.*]] = load <4 x bfloat>, ptr [[TMP1]], align 2
+; NO-ZVFBFMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x bfloat>, ptr [[TMP2]], align 2
+; NO-ZVFBFMIN-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
; NO-ZVFBFMIN-NEXT: [[TMP7:%.*]] = fpext <4 x bfloat> [[WIDE_LOAD]] to <4 x float>
; NO-ZVFBFMIN-NEXT: [[TMP8:%.*]] = fpext <4 x bfloat> [[WIDE_LOAD1]] to <4 x float>
; NO-ZVFBFMIN-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x float> [[WIDE_LOAD2]])
-; NO-ZVFBFMIN-NEXT: store <4 x float> [[TMP9]], ptr [[TMP6]], align 4
+; NO-ZVFBFMIN-NEXT: store <4 x float> [[TMP9]], ptr [[TMP3]], align 4
; NO-ZVFBFMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4
; NO-ZVFBFMIN-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NO-ZVFBFMIN-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -138,6 +152,54 @@ define void @vfwmaccbf16.vv(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64
; NO-ZVFBFMIN: [[EXIT]]:
; NO-ZVFBFMIN-NEXT: ret void
;
+; NO-ZVFBFMIN-PREDICATED-LABEL: define void @vfwmaccbf16.vv(
+; NO-ZVFBFMIN-PREDICATED-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[ENTRY:.*]]:
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; NO-ZVFBFMIN-PREDICATED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NO-ZVFBFMIN-PREDICATED: [[VECTOR_PH]]:
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-ZVFBFMIN-PREDICATED: [[VECTOR_BODY]]:
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[I:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[C_GEP:%.*]] = getelementptr float, ptr [[C]], i64 [[I]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <4 x bfloat>, ptr [[A_GEP]], align 2
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = load <4 x bfloat>, ptr [[B_GEP]], align 2
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = load <4 x float>, ptr [[C_GEP]], align 4
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[TMP4:%.*]] = fpext <4 x bfloat> [[WIDE_MASKED_LOAD]] to <4 x float>
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[TMP5:%.*]] = fpext <4 x bfloat> [[WIDE_MASKED_LOAD3]] to <4 x float>
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[WIDE_MASKED_LOAD4]])
+; NO-ZVFBFMIN-PREDICATED-NEXT: store <4 x float> [[TMP6]], ptr [[C_GEP]], align 4
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[I]], 4
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-ZVFBFMIN-PREDICATED: [[MIDDLE_BLOCK]]:
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-ZVFBFMIN-PREDICATED: [[SCALAR_PH]]:
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; NO-ZVFBFMIN-PREDICATED-NEXT: br label %[[LOOP:.*]]
+; NO-ZVFBFMIN-PREDICATED: [[LOOP]]:
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[A_GEP1:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I1]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[B_GEP1:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I1]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[C_GEP1:%.*]] = getelementptr float, ptr [[C]], i64 [[I1]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[X:%.*]] = load bfloat, ptr [[A_GEP1]], align 2
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[Y:%.*]] = load bfloat, ptr [[B_GEP1]], align 2
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[Z:%.*]] = load float, ptr [[C_GEP1]], align 4
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[X_EXT:%.*]] = fpext bfloat [[X]] to float
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[Y_EXT:%.*]] = fpext bfloat [[Y]] to float
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[FMULADD:%.*]] = call float @llvm.fmuladd.f32(float [[X_EXT]], float [[Y_EXT]], float [[Z]])
+; NO-ZVFBFMIN-PREDICATED-NEXT: store float [[FMULADD]], ptr [[C_GEP1]], align 4
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[I_NEXT]] = add i64 [[I1]], 1
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-ZVFBFMIN-PREDICATED: [[EXIT]]:
+; NO-ZVFBFMIN-PREDICATED-NEXT: ret void
+;
; ZVFBFMIN-LABEL: define void @vfwmaccbf16.vv(
; ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; ZVFBFMIN-NEXT: [[ENTRY:.*]]:
@@ -158,16 +220,13 @@ define void @vfwmaccbf16.vv(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64
; ZVFBFMIN-NEXT: [[TMP7:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[TMP6]]
; ZVFBFMIN-NEXT: [[TMP8:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[TMP6]]
; ZVFBFMIN-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP6]]
-; ZVFBFMIN-NEXT: [[TMP10:%.*]] = getelementptr bfloat, ptr [[TMP7]], i32 0
-; ZVFBFMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x bfloat>, ptr [[TMP10]], align 2
-; ZVFBFMIN-NEXT: [[TMP11:%.*]] = getelementptr bfloat, ptr [[TMP8]], i32 0
-; ZVFBFMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x bfloat>, ptr [[TMP11]], align 2
-; ZVFBFMIN-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[TMP9]], i32 0
-; ZVFBFMIN-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
+; ZVFBFMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x bfloat>, ptr [[TMP7]], align 2
+; ZVFBFMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x bfloat>, ptr [[TMP8]], align 2
+; ZVFBFMIN-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP9]], align 4
; ZVFBFMIN-NEXT: [[TMP13:%.*]] = fpext <vscale x 4 x bfloat> [[WIDE_LOAD]] to <vscale x 4 x float>
; ZVFBFMIN-NEXT: [[TMP14:%.*]] = fpext <vscale x 4 x bfloat> [[WIDE_LOAD1]] to <vscale x 4 x float>
; ZVFBFMIN-NEXT: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> [[TMP13]], <vscale x 4 x float> [[TMP14]], <vscale x 4 x float> [[WIDE_LOAD2]])
-; ZVFBFMIN-NEXT: store <vscale x 4 x float> [[TMP15]], ptr [[TMP12]], align 4
+; ZVFBFMIN-NEXT: store <vscale x 4 x float> [[TMP15]], ptr [[TMP9]], align 4
; ZVFBFMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP6]], [[TMP5]]
; ZVFBFMIN-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; ZVFBFMIN-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -221,6 +280,11 @@ exit:
; NO-ZVFBFMIN: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; NO-ZVFBFMIN: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
;.
+; NO-ZVFBFMIN-PREDICATED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; NO-ZVFBFMIN-PREDICATED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; NO-ZVFBFMIN-PREDICATED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; NO-ZVFBFMIN-PREDICATED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
; ZVFBFMIN: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; ZVFBFMIN: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; ZVFBFMIN: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
index f8b83ff..aad9128 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
@@ -595,11 +595,10 @@ define void @empty_block_with_phi_1(ptr %src, i64 %N) #0 {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[TMP9:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[TMP10]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP11]], align 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP10]], align 2
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <vscale x 8 x i16> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 8 x i1> [[TMP12]], <vscale x 8 x i16> splat (i16 99), <vscale x 8 x i16> [[WIDE_LOAD]]
-; CHECK-NEXT: store <vscale x 8 x i16> [[PREDPHI]], ptr [[TMP11]], align 2
+; CHECK-NEXT: store <vscale x 8 x i16> [[PREDPHI]], ptr [[TMP10]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP9]], [[TMP5]]
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
@@ -674,11 +673,10 @@ define void @empty_block_with_phi_2(ptr %src, i64 %N) #0 {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[TMP9:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[TMP10]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP11]], align 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP10]], align 2
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <vscale x 8 x i16> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 8 x i1> [[TMP12]], <vscale x 8 x i16> [[WIDE_LOAD]], <vscale x 8 x i16> splat (i16 99)
-; CHECK-NEXT: store <vscale x 8 x i16> [[PREDPHI]], ptr [[TMP11]], align 2
+; CHECK-NEXT: store <vscale x 8 x i16> [[PREDPHI]], ptr [[TMP10]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP9]], [[TMP5]]
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index 22c56c8..ab8875b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -248,8 +248,7 @@ define i32 @cost_of_exit_branch_and_cond_insts(ptr %a, ptr %b, i1 %c, i16 %x) #0
; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META11]], !noalias [[META14]]
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE18]]
; CHECK: [[PRED_STORE_CONTINUE18]]:
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP20]], i32 4, <8 x i1> [[BROADCAST_SPLAT]]), !alias.scope [[META14]]
+; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP11]], i32 4, <8 x i1> [[BROADCAST_SPLAT]]), !alias.scope [[META14]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll b/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll
index a2faaaa..db3215a6 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll
@@ -30,10 +30,9 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = add <vscale x 2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <vscale x 2 x i64> [[TMP9]], ptr [[TMP8]], align 8
+; CHECK-NEXT: store <vscale x 2 x i64> [[TMP9]], ptr [[TMP7]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -91,8 +90,7 @@ define i64 @vector_add_reduce(ptr noalias nocapture %a) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
; CHECK-NEXT: [[TMP9]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
index c6661cb..f02e5de 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
@@ -27,10 +27,9 @@ define void @vector_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = udiv <vscale x 2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <vscale x 2 x i64> [[TMP9]], ptr [[TMP8]], align 8
+; CHECK-NEXT: store <vscale x 2 x i64> [[TMP9]], ptr [[TMP7]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -62,21 +61,20 @@ define void @vector_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
; FIXED: vector.body:
; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; FIXED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4
-; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
; FIXED-NEXT: [[TMP4:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; FIXED-NEXT: [[TMP5:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]]
-; FIXED-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP2]], align 8
+; FIXED-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP1]], align 8
; FIXED-NEXT: store <4 x i64> [[TMP5]], ptr [[TMP3]], align 8
; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; FIXED: middle.block:
-; FIXED-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; FIXED-NEXT: br label [[FOR_END:%.*]]
; FIXED: scalar.ph:
-; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; FIXED-NEXT: br label [[FOR_BODY:%.*]]
; FIXED: for.body:
; FIXED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -127,10 +125,9 @@ define void @vector_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = sdiv <vscale x 2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <vscale x 2 x i64> [[TMP9]], ptr [[TMP8]], align 8
+; CHECK-NEXT: store <vscale x 2 x i64> [[TMP9]], ptr [[TMP7]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -162,21 +159,20 @@ define void @vector_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
; FIXED: vector.body:
; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; FIXED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4
-; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
; FIXED-NEXT: [[TMP4:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; FIXED-NEXT: [[TMP5:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]]
-; FIXED-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP2]], align 8
+; FIXED-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP1]], align 8
; FIXED-NEXT: store <4 x i64> [[TMP5]], ptr [[TMP3]], align 8
; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; FIXED: middle.block:
-; FIXED-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; FIXED-NEXT: br label [[FOR_END:%.*]]
; FIXED: scalar.ph:
-; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; FIXED-NEXT: br label [[FOR_BODY:%.*]]
; FIXED: for.body:
; FIXED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -227,10 +223,9 @@ define void @vector_urem(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = urem <vscale x 2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <vscale x 2 x i64> [[TMP9]], ptr [[TMP8]], align 8
+; CHECK-NEXT: store <vscale x 2 x i64> [[TMP9]], ptr [[TMP7]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -262,21 +257,20 @@ define void @vector_urem(ptr noalias nocapture %a, i64 %v, i64 %n) {
; FIXED: vector.body:
; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; FIXED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4
-; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
; FIXED-NEXT: [[TMP4:%.*]] = urem <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; FIXED-NEXT: [[TMP5:%.*]] = urem <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]]
-; FIXED-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP2]], align 8
+; FIXED-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP1]], align 8
; FIXED-NEXT: store <4 x i64> [[TMP5]], ptr [[TMP3]], align 8
; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; FIXED: middle.block:
-; FIXED-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; FIXED-NEXT: br label [[FOR_END:%.*]]
; FIXED: scalar.ph:
-; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; FIXED-NEXT: br label [[FOR_BODY:%.*]]
; FIXED: for.body:
; FIXED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -327,10 +321,9 @@ define void @vector_srem(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = srem <vscale x 2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <vscale x 2 x i64> [[TMP9]], ptr [[TMP8]], align 8
+; CHECK-NEXT: store <vscale x 2 x i64> [[TMP9]], ptr [[TMP7]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -362,21 +355,20 @@ define void @vector_srem(ptr noalias nocapture %a, i64 %v, i64 %n) {
; FIXED: vector.body:
; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; FIXED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4
-; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
; FIXED-NEXT: [[TMP4:%.*]] = srem <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; FIXED-NEXT: [[TMP5:%.*]] = srem <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]]
-; FIXED-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP2]], align 8
+; FIXED-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP1]], align 8
; FIXED-NEXT: store <4 x i64> [[TMP5]], ptr [[TMP3]], align 8
; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; FIXED: middle.block:
-; FIXED-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; FIXED-NEXT: br label [[FOR_END:%.*]]
; FIXED: scalar.ph:
-; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; FIXED-NEXT: br label [[FOR_BODY:%.*]]
; FIXED: for.body:
; FIXED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -429,11 +421,10 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP8]], align 8
; CHECK-NEXT: [[TMP11:%.*]] = udiv <vscale x 2 x i64> [[WIDE_LOAD]], [[TMP10]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[WIDE_LOAD]]
-; CHECK-NEXT: store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP9]], align 8
+; CHECK-NEXT: store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP8]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -473,23 +464,22 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
; FIXED: vector.body:
; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
-; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; FIXED-NEXT: [[TMP7:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], [[TMP5]]
; FIXED-NEXT: [[TMP8:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[TMP5]]
; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP7]], <4 x i64> [[WIDE_LOAD]]
; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD1]]
-; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP3]], align 8
+; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP2]], align 8
; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP4]], align 8
; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; FIXED: middle.block:
-; FIXED-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; FIXED-NEXT: br label [[FOR_END:%.*]]
; FIXED: scalar.ph:
-; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; FIXED-NEXT: br label [[FOR_BODY:%.*]]
; FIXED: for.body:
; FIXED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -554,11 +544,10 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP8]], align 8
; CHECK-NEXT: [[TMP11:%.*]] = sdiv <vscale x 2 x i64> [[WIDE_LOAD]], [[TMP10]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[WIDE_LOAD]]
-; CHECK-NEXT: store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP9]], align 8
+; CHECK-NEXT: store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP8]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -598,23 +587,22 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
; FIXED: vector.body:
; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
-; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; FIXED-NEXT: [[TMP7:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], [[TMP5]]
; FIXED-NEXT: [[TMP8:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[TMP5]]
; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP7]], <4 x i64> [[WIDE_LOAD]]
; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD1]]
-; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP3]], align 8
+; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP2]], align 8
; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP4]], align 8
; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; FIXED: middle.block:
-; FIXED-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; FIXED-NEXT: br label [[FOR_END:%.*]]
; FIXED: scalar.ph:
-; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; FIXED-NEXT: br label [[FOR_BODY:%.*]]
; FIXED: for.body:
; FIXED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -675,12 +663,11 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <vscale x 2 x i64> [[WIDE_LOAD]], splat (i64 42)
; CHECK-NEXT: [[TMP10:%.*]] = udiv <vscale x 2 x i64> [[WIDE_LOAD]], splat (i64 27)
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x i64> [[TMP10]], <vscale x 2 x i64> [[WIDE_LOAD]]
-; CHECK-NEXT: store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP8]], align 8
+; CHECK-NEXT: store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP7]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
@@ -716,9 +703,8 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) {
; FIXED: vector.body:
; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; FIXED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4
-; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
; FIXED-NEXT: [[TMP4:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD]], splat (i64 42)
; FIXED-NEXT: [[TMP5:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD1]], splat (i64 42)
@@ -726,15 +712,15 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) {
; FIXED-NEXT: [[TMP7:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], splat (i64 27)
; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[TMP6]], <4 x i64> [[WIDE_LOAD]]
; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[TMP7]], <4 x i64> [[WIDE_LOAD1]]
-; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP2]], align 8
+; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP1]], align 8
; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP3]], align 8
; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; FIXED: middle.block:
-; FIXED-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; FIXED-NEXT: br label [[FOR_END:%.*]]
; FIXED: scalar.ph:
-; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; FIXED-NEXT: br label [[FOR_BODY:%.*]]
; FIXED: for.body:
; FIXED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -795,12 +781,11 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <vscale x 2 x i64> [[WIDE_LOAD]], splat (i64 42)
; CHECK-NEXT: [[TMP10:%.*]] = sdiv <vscale x 2 x i64> [[WIDE_LOAD]], splat (i64 27)
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x i64> [[TMP10]], <vscale x 2 x i64> [[WIDE_LOAD]]
-; CHECK-NEXT: store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP8]], align 8
+; CHECK-NEXT: store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP7]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
@@ -836,9 +821,8 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) {
; FIXED: vector.body:
; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; FIXED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4
-; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
; FIXED-NEXT: [[TMP4:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD]], splat (i64 42)
; FIXED-NEXT: [[TMP5:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD1]], splat (i64 42)
@@ -846,15 +830,15 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) {
; FIXED-NEXT: [[TMP7:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], splat (i64 27)
; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[TMP6]], <4 x i64> [[WIDE_LOAD]]
; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[TMP7]], <4 x i64> [[WIDE_LOAD1]]
-; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP2]], align 8
+; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP1]], align 8
; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP3]], align 8
; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; FIXED: middle.block:
-; FIXED-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; FIXED-NEXT: br label [[FOR_END:%.*]]
; FIXED: scalar.ph:
-; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; FIXED-NEXT: br label [[FOR_BODY:%.*]]
; FIXED: for.body:
; FIXED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -915,13 +899,12 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD]], splat (i8 -128)
; CHECK-NEXT: [[TMP10:%.*]] = select <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i8> splat (i8 -1), <vscale x 16 x i8> splat (i8 1)
; CHECK-NEXT: [[TMP11:%.*]] = sdiv <vscale x 16 x i8> [[WIDE_LOAD]], [[TMP10]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[WIDE_LOAD]]
-; CHECK-NEXT: store <vscale x 16 x i8> [[PREDPHI]], ptr [[TMP8]], align 1
+; CHECK-NEXT: store <vscale x 16 x i8> [[PREDPHI]], ptr [[TMP7]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
@@ -957,9 +940,8 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) {
; FIXED: vector.body:
; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; FIXED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
-; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 32
-; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP2]], align 1
+; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP1]], align 1
; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP3]], align 1
; FIXED-NEXT: [[TMP4:%.*]] = icmp ne <32 x i8> [[WIDE_LOAD]], splat (i8 -128)
; FIXED-NEXT: [[TMP5:%.*]] = icmp ne <32 x i8> [[WIDE_LOAD1]], splat (i8 -128)
@@ -969,15 +951,15 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) {
; FIXED-NEXT: [[TMP9:%.*]] = sdiv <32 x i8> [[WIDE_LOAD1]], [[TMP7]]
; FIXED-NEXT: [[PREDPHI:%.*]] = select <32 x i1> [[TMP4]], <32 x i8> [[TMP8]], <32 x i8> [[WIDE_LOAD]]
; FIXED-NEXT: [[PREDPHI2:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> [[TMP9]], <32 x i8> [[WIDE_LOAD1]]
-; FIXED-NEXT: store <32 x i8> [[PREDPHI]], ptr [[TMP2]], align 1
+; FIXED-NEXT: store <32 x i8> [[PREDPHI]], ptr [[TMP1]], align 1
; FIXED-NEXT: store <32 x i8> [[PREDPHI2]], ptr [[TMP3]], align 1
; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; FIXED: middle.block:
-; FIXED-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; FIXED-NEXT: br label [[FOR_END:%.*]]
; FIXED: scalar.ph:
-; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; FIXED-NEXT: br label [[FOR_BODY:%.*]]
; FIXED: for.body:
; FIXED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll
index 25f52b2..5d032e6 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: opt -passes=loop-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s
; Make sure we do not vectorize a loop with a widened int induction.
@@ -33,8 +33,7 @@ define void @test_wide_integer_induction(ptr noalias %a, i64 %N) {
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP13]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[EVL_BASED_IV]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 0
-; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VEC_IND]], ptr align 8 [[TMP15]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VEC_IND]], ptr align 8 [[TMP14]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[TMP8]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll
index 0b307c2..53e43e1 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S | FileCheck %s -check-prefix=NO-ZVFHMIN
+; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue | FileCheck %s -check-prefix=NO-ZVFHMIN-PREDICATED
; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfhmin -S | FileCheck %s -check-prefix=ZVFHMIN
define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
@@ -21,6 +22,24 @@ define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
; NO-ZVFHMIN: [[EXIT]]:
; NO-ZVFHMIN-NEXT: ret void
;
+; NO-ZVFHMIN-PREDICATED-LABEL: define void @fadd(
+; NO-ZVFHMIN-PREDICATED-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-ZVFHMIN-PREDICATED-NEXT: [[ENTRY:.*]]:
+; NO-ZVFHMIN-PREDICATED-NEXT: br label %[[LOOP:.*]]
+; NO-ZVFHMIN-PREDICATED: [[LOOP]]:
+; NO-ZVFHMIN-PREDICATED-NEXT: [[I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; NO-ZVFHMIN-PREDICATED-NEXT: [[A_GEP:%.*]] = getelementptr half, ptr [[A]], i64 [[I]]
+; NO-ZVFHMIN-PREDICATED-NEXT: [[B_GEP:%.*]] = getelementptr half, ptr [[B]], i64 [[I]]
+; NO-ZVFHMIN-PREDICATED-NEXT: [[X:%.*]] = load half, ptr [[A_GEP]], align 2
+; NO-ZVFHMIN-PREDICATED-NEXT: [[Y:%.*]] = load half, ptr [[B_GEP]], align 2
+; NO-ZVFHMIN-PREDICATED-NEXT: [[Z:%.*]] = fadd half [[X]], [[Y]]
+; NO-ZVFHMIN-PREDICATED-NEXT: store half [[Z]], ptr [[A_GEP]], align 2
+; NO-ZVFHMIN-PREDICATED-NEXT: [[I_NEXT]] = add i64 [[I]], 1
+; NO-ZVFHMIN-PREDICATED-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; NO-ZVFHMIN-PREDICATED-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-ZVFHMIN-PREDICATED: [[EXIT]]:
+; NO-ZVFHMIN-PREDICATED-NEXT: ret void
+;
; ZVFHMIN-LABEL: define void @fadd(
; ZVFHMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; ZVFHMIN-NEXT: [[ENTRY:.*]]:
@@ -40,12 +59,10 @@ define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
; ZVFHMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; ZVFHMIN-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]]
; ZVFHMIN-NEXT: [[TMP2:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]]
-; ZVFHMIN-NEXT: [[TMP3:%.*]] = getelementptr half, ptr [[TMP1]], i32 0
-; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP3]], align 2
-; ZVFHMIN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr [[TMP2]], i32 0
-; ZVFHMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x half>, ptr [[TMP4]], align 2
+; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP1]], align 2
+; ZVFHMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x half>, ptr [[TMP2]], align 2
; ZVFHMIN-NEXT: [[TMP11:%.*]] = fadd <vscale x 8 x half> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; ZVFHMIN-NEXT: store <vscale x 8 x half> [[TMP11]], ptr [[TMP3]], align 2
+; ZVFHMIN-NEXT: store <vscale x 8 x half> [[TMP11]], ptr [[TMP1]], align 2
; ZVFHMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; ZVFHMIN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; ZVFHMIN-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
index 1f6d71e..ce58ae1 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
@@ -24,12 +24,10 @@ define i64 @pr97452_scalable_vf1_for(ptr %src, ptr noalias %dst) #0 {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 1 x i64> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD]] = load <vscale x 1 x i64>, ptr [[TMP6]], align 8
+; CHECK-NEXT: [[WIDE_LOAD]] = load <vscale x 1 x i64>, ptr [[TMP5]], align 8
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> [[VECTOR_RECUR]], <vscale x 1 x i64> [[WIDE_LOAD]], i32 -1)
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <vscale x 1 x i64> [[TMP7]], ptr [[TMP9]], align 8
+; CHECK-NEXT: store <vscale x 1 x i64> [[TMP7]], ptr [[TMP8]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll b/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll
index 283688c..c9ba2af 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll
@@ -37,15 +37,12 @@ define void @fmin32(ptr noundef readonly captures(none) %input1, ptr noundef rea
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT1]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT2]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x float>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP17:%.*]] = call <vscale x 4 x float> @llvm.minimumnum.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[WIDE_LOAD5]])
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[OUTPUT]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i32 0
-; CHECK-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[TMP11]], align 4
+; CHECK-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[TMP10]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -103,15 +100,12 @@ define void @fmin32(ptr noundef readonly captures(none) %input1, ptr noundef rea
; ZVFHMIN: [[VECTOR_BODY]]:
; ZVFHMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; ZVFHMIN-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT1]], i64 0, i64 [[INDEX]]
-; ZVFHMIN-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i32 0
-; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
+; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4
; ZVFHMIN-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT2]], i64 0, i64 [[INDEX]]
-; ZVFHMIN-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw float, ptr [[TMP15]], i32 0
-; ZVFHMIN-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x float>, ptr [[TMP16]], align 4
+; ZVFHMIN-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x float>, ptr [[TMP15]], align 4
; ZVFHMIN-NEXT: [[TMP17:%.*]] = call <vscale x 4 x float> @llvm.minimumnum.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[WIDE_LOAD5]])
; ZVFHMIN-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[OUTPUT]], i64 0, i64 [[INDEX]]
-; ZVFHMIN-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw float, ptr [[TMP18]], i32 0
-; ZVFHMIN-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[TMP19]], align 4
+; ZVFHMIN-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[TMP18]], align 4
; ZVFHMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
; ZVFHMIN-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; ZVFHMIN-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -192,15 +186,12 @@ define void @fmax32(ptr noundef readonly captures(none) %input1, ptr noundef rea
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT1]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT2]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x float>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP17:%.*]] = call <vscale x 4 x float> @llvm.maximumnum.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[WIDE_LOAD5]])
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[OUTPUT]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i32 0
-; CHECK-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[TMP11]], align 4
+; CHECK-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[TMP10]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -258,15 +249,12 @@ define void @fmax32(ptr noundef readonly captures(none) %input1, ptr noundef rea
; ZVFHMIN: [[VECTOR_BODY]]:
; ZVFHMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; ZVFHMIN-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT1]], i64 0, i64 [[INDEX]]
-; ZVFHMIN-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i32 0
-; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
+; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4
; ZVFHMIN-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT2]], i64 0, i64 [[INDEX]]
-; ZVFHMIN-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw float, ptr [[TMP15]], i32 0
-; ZVFHMIN-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x float>, ptr [[TMP16]], align 4
+; ZVFHMIN-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x float>, ptr [[TMP15]], align 4
; ZVFHMIN-NEXT: [[TMP17:%.*]] = call <vscale x 4 x float> @llvm.maximumnum.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[WIDE_LOAD5]])
; ZVFHMIN-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[OUTPUT]], i64 0, i64 [[INDEX]]
-; ZVFHMIN-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw float, ptr [[TMP18]], i32 0
-; ZVFHMIN-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[TMP19]], align 4
+; ZVFHMIN-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[TMP18]], align 4
; ZVFHMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
; ZVFHMIN-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; ZVFHMIN-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -347,15 +335,12 @@ define void @fmin64(ptr noundef readonly captures(none) %input1, ptr noundef rea
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT1]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP2]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT2]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw double, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 2 x double>, ptr [[TMP6]], align 8
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 2 x double>, ptr [[TMP5]], align 8
; CHECK-NEXT: [[TMP17:%.*]] = call <vscale x 2 x double> @llvm.minimumnum.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD]], <vscale x 2 x double> [[WIDE_LOAD5]])
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[OUTPUT]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw double, ptr [[TMP10]], i32 0
-; CHECK-NEXT: store <vscale x 2 x double> [[TMP17]], ptr [[TMP11]], align 8
+; CHECK-NEXT: store <vscale x 2 x double> [[TMP17]], ptr [[TMP10]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -413,15 +398,12 @@ define void @fmin64(ptr noundef readonly captures(none) %input1, ptr noundef rea
; ZVFHMIN: [[VECTOR_BODY]]:
; ZVFHMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; ZVFHMIN-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT1]], i64 0, i64 [[INDEX]]
-; ZVFHMIN-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw double, ptr [[TMP13]], i32 0
-; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP14]], align 8
+; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP13]], align 8
; ZVFHMIN-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT2]], i64 0, i64 [[INDEX]]
-; ZVFHMIN-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw double, ptr [[TMP15]], i32 0
-; ZVFHMIN-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 2 x double>, ptr [[TMP16]], align 8
+; ZVFHMIN-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 2 x double>, ptr [[TMP15]], align 8
; ZVFHMIN-NEXT: [[TMP17:%.*]] = call <vscale x 2 x double> @llvm.minimumnum.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD]], <vscale x 2 x double> [[WIDE_LOAD5]])
; ZVFHMIN-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[OUTPUT]], i64 0, i64 [[INDEX]]
-; ZVFHMIN-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw double, ptr [[TMP18]], i32 0
-; ZVFHMIN-NEXT: store <vscale x 2 x double> [[TMP17]], ptr [[TMP19]], align 8
+; ZVFHMIN-NEXT: store <vscale x 2 x double> [[TMP17]], ptr [[TMP18]], align 8
; ZVFHMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
; ZVFHMIN-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; ZVFHMIN-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -502,15 +484,12 @@ define void @fmax64(ptr noundef readonly captures(none) %input1, ptr noundef rea
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT1]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP2]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT2]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw double, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 2 x double>, ptr [[TMP6]], align 8
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 2 x double>, ptr [[TMP5]], align 8
; CHECK-NEXT: [[TMP17:%.*]] = call <vscale x 2 x double> @llvm.maximumnum.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD]], <vscale x 2 x double> [[WIDE_LOAD5]])
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[OUTPUT]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw double, ptr [[TMP10]], i32 0
-; CHECK-NEXT: store <vscale x 2 x double> [[TMP17]], ptr [[TMP11]], align 8
+; CHECK-NEXT: store <vscale x 2 x double> [[TMP17]], ptr [[TMP10]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -568,15 +547,12 @@ define void @fmax64(ptr noundef readonly captures(none) %input1, ptr noundef rea
; ZVFHMIN: [[VECTOR_BODY]]:
; ZVFHMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; ZVFHMIN-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT1]], i64 0, i64 [[INDEX]]
-; ZVFHMIN-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw double, ptr [[TMP13]], i32 0
-; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP14]], align 8
+; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP13]], align 8
; ZVFHMIN-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT2]], i64 0, i64 [[INDEX]]
-; ZVFHMIN-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw double, ptr [[TMP15]], i32 0
-; ZVFHMIN-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 2 x double>, ptr [[TMP16]], align 8
+; ZVFHMIN-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 2 x double>, ptr [[TMP15]], align 8
; ZVFHMIN-NEXT: [[TMP17:%.*]] = call <vscale x 2 x double> @llvm.maximumnum.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD]], <vscale x 2 x double> [[WIDE_LOAD5]])
; ZVFHMIN-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[OUTPUT]], i64 0, i64 [[INDEX]]
-; ZVFHMIN-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw double, ptr [[TMP18]], i32 0
-; ZVFHMIN-NEXT: store <vscale x 2 x double> [[TMP17]], ptr [[TMP19]], align 8
+; ZVFHMIN-NEXT: store <vscale x 2 x double> [[TMP17]], ptr [[TMP18]], align 8
; ZVFHMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
; ZVFHMIN-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; ZVFHMIN-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -657,15 +633,12 @@ define void @fmin16(ptr noundef readonly captures(none) %input1, ptr noundef rea
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT1]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw half, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP3]], align 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP2]], align 2
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT2]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw half, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x half>, ptr [[TMP5]], align 2
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x half>, ptr [[TMP4]], align 2
; CHECK-NEXT: [[TMP17:%.*]] = call <vscale x 8 x half> @llvm.minimumnum.nxv8f16(<vscale x 8 x half> [[WIDE_LOAD]], <vscale x 8 x half> [[WIDE_LOAD5]])
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[OUTPUT]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw half, ptr [[TMP7]], i32 0
-; CHECK-NEXT: store <vscale x 8 x half> [[TMP17]], ptr [[TMP8]], align 2
+; CHECK-NEXT: store <vscale x 8 x half> [[TMP17]], ptr [[TMP7]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -723,15 +696,12 @@ define void @fmin16(ptr noundef readonly captures(none) %input1, ptr noundef rea
; ZVFHMIN: [[VECTOR_BODY]]:
; ZVFHMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; ZVFHMIN-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT1]], i64 0, i64 [[INDEX]]
-; ZVFHMIN-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw half, ptr [[TMP13]], i32 0
-; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP14]], align 2
+; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP13]], align 2
; ZVFHMIN-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT2]], i64 0, i64 [[INDEX]]
-; ZVFHMIN-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw half, ptr [[TMP15]], i32 0
-; ZVFHMIN-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x half>, ptr [[TMP16]], align 2
+; ZVFHMIN-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x half>, ptr [[TMP15]], align 2
; ZVFHMIN-NEXT: [[TMP17:%.*]] = call <vscale x 8 x half> @llvm.minimumnum.nxv8f16(<vscale x 8 x half> [[WIDE_LOAD]], <vscale x 8 x half> [[WIDE_LOAD5]])
; ZVFHMIN-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[OUTPUT]], i64 0, i64 [[INDEX]]
-; ZVFHMIN-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw half, ptr [[TMP18]], i32 0
-; ZVFHMIN-NEXT: store <vscale x 8 x half> [[TMP17]], ptr [[TMP19]], align 2
+; ZVFHMIN-NEXT: store <vscale x 8 x half> [[TMP17]], ptr [[TMP18]], align 2
; ZVFHMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
; ZVFHMIN-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; ZVFHMIN-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -812,15 +782,12 @@ define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef rea
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT1]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw half, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP3]], align 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP2]], align 2
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT2]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw half, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x half>, ptr [[TMP5]], align 2
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x half>, ptr [[TMP4]], align 2
; CHECK-NEXT: [[TMP17:%.*]] = call <vscale x 8 x half> @llvm.maximumnum.nxv8f16(<vscale x 8 x half> [[WIDE_LOAD]], <vscale x 8 x half> [[WIDE_LOAD5]])
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[OUTPUT]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw half, ptr [[TMP7]], i32 0
-; CHECK-NEXT: store <vscale x 8 x half> [[TMP17]], ptr [[TMP8]], align 2
+; CHECK-NEXT: store <vscale x 8 x half> [[TMP17]], ptr [[TMP7]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -878,15 +845,12 @@ define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef rea
; ZVFHMIN: [[VECTOR_BODY]]:
; ZVFHMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; ZVFHMIN-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT1]], i64 0, i64 [[INDEX]]
-; ZVFHMIN-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw half, ptr [[TMP13]], i32 0
-; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP14]], align 2
+; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP13]], align 2
; ZVFHMIN-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT2]], i64 0, i64 [[INDEX]]
-; ZVFHMIN-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw half, ptr [[TMP15]], i32 0
-; ZVFHMIN-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x half>, ptr [[TMP16]], align 2
+; ZVFHMIN-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x half>, ptr [[TMP15]], align 2
; ZVFHMIN-NEXT: [[TMP17:%.*]] = call <vscale x 8 x half> @llvm.maximumnum.nxv8f16(<vscale x 8 x half> [[WIDE_LOAD]], <vscale x 8 x half> [[WIDE_LOAD5]])
; ZVFHMIN-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[OUTPUT]], i64 0, i64 [[INDEX]]
-; ZVFHMIN-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw half, ptr [[TMP18]], i32 0
-; ZVFHMIN-NEXT: store <vscale x 8 x half> [[TMP17]], ptr [[TMP19]], align 2
+; ZVFHMIN-NEXT: store <vscale x 8 x half> [[TMP17]], ptr [[TMP18]], align 2
; ZVFHMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
; ZVFHMIN-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; ZVFHMIN-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
index d485a743..5b2d308 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple riscv64-linux-gnu -mattr=+v,+d -passes=loop-vectorize < %s -S -o - | FileCheck %s -check-prefix=OUTLOOP
; RUN: opt -mtriple riscv64-linux-gnu -mattr=+v,+d -passes=loop-vectorize -prefer-inloop-reductions < %s -S -o - | FileCheck %s -check-prefix=INLOOP
-; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck --check-prefix=IF-EVL-OUTLOOP %s
-; RUN: opt -passes=loop-vectorize -prefer-inloop-reductions -force-tail-folding-style=data-with-evl -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck --check-prefix=IF-EVL-INLOOP %s
+; RUN: opt -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck --check-prefix=IF-EVL-OUTLOOP %s
+; RUN: opt -passes=loop-vectorize -prefer-inloop-reductions -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck --check-prefix=IF-EVL-INLOOP %s
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
@@ -30,8 +30,7 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; OUTLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; OUTLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
-; OUTLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP7]], i32 0
-; OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP8]], align 2
+; OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP7]], align 2
; OUTLOOP-NEXT: [[TMP9:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i32>
; OUTLOOP-NEXT: [[TMP10]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP9]]
; OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
@@ -83,8 +82,7 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; INLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
; INLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
-; INLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP7]], i32 0
-; INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP8]], align 2
+; INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP7]], align 2
; INLOOP-NEXT: [[TMP9:%.*]] = sext <vscale x 8 x i16> [[WIDE_LOAD]] to <vscale x 8 x i32>
; INLOOP-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP9]])
; INLOOP-NEXT: [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI]]
@@ -138,8 +136,7 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; IF-EVL-OUTLOOP-NEXT: [[AVL:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
; IF-EVL-OUTLOOP-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true)
; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[EVL_BASED_IV]]
-; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP7]], i32 0
-; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.vp.load.nxv4i16.p0(ptr align 2 [[TMP8]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.vp.load.nxv4i16.p0(ptr align 2 [[TMP7]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = sext <vscale x 4 x i16> [[VP_OP_LOAD]] to <vscale x 4 x i32>
; IF-EVL-OUTLOOP-NEXT: [[VP_OP:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP9]]
; IF-EVL-OUTLOOP-NEXT: [[TMP10]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[VP_OP]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP5]])
@@ -194,8 +191,7 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; IF-EVL-INLOOP-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
; IF-EVL-INLOOP-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[TMP5]], i32 8, i1 true)
; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[EVL_BASED_IV]]
-; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0
-; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.vp.load.nxv8i16.p0(ptr align 2 [[TMP9]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.vp.load.nxv8i16.p0(ptr align 2 [[TMP8]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i16> [[VP_OP_LOAD]] to <vscale x 8 x i32>
; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = call i32 @llvm.vp.reduce.add.nxv8i32(i32 0, <vscale x 8 x i32> [[TMP14]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-INLOOP-NEXT: [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI]]
@@ -267,8 +263,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
; OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[BROADCAST_SPLAT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; OUTLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; OUTLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
; OUTLOOP-NEXT: [[TMP9:%.*]] = icmp slt <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
; OUTLOOP-NEXT: [[TMP10]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
; OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -314,8 +309,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
; INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
; INLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; INLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
; INLOOP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
; INLOOP-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smin.i32(i32 [[TMP9]], i32 [[VEC_PHI]])
; INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -364,8 +358,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
-; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], [[VEC_PHI]]
; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
; IF-EVL-OUTLOOP-NEXT: [[TMP15]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP14]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP9]])
@@ -415,8 +408,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
-; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = call i32 @llvm.vp.reduce.smin.nxv4i32(i32 2147483647, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-INLOOP-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smin.i32(i32 [[TMP13]], i32 [[VEC_PHI]])
; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = zext i32 [[TMP9]] to i64
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
index 85ccbab..e226eea 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
@@ -78,9 +78,9 @@ define void @load_store_factor2_i32(ptr %p) {
; FIXED-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; FIXED: middle.block:
-; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-NEXT: br label [[EXIT:%.*]]
; FIXED: scalar.ph:
-; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; FIXED-NEXT: br label [[LOOP:%.*]]
; FIXED: loop:
; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -252,9 +252,9 @@ define void @load_store_factor2_i64(ptr %p) {
; FIXED-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; FIXED: middle.block:
-; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-NEXT: br label [[EXIT:%.*]]
; FIXED: scalar.ph:
-; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; FIXED-NEXT: br label [[LOOP:%.*]]
; FIXED: loop:
; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -437,9 +437,9 @@ define void @load_store_factor3_i32(ptr %p) {
; FIXED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; FIXED: middle.block:
-; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-NEXT: br label [[EXIT:%.*]]
; FIXED: scalar.ph:
-; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; FIXED-NEXT: br label [[LOOP:%.*]]
; FIXED: loop:
; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -640,9 +640,9 @@ define void @load_store_factor3_i64(ptr %p) {
; FIXED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; FIXED: middle.block:
-; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-NEXT: br label [[EXIT:%.*]]
; FIXED: scalar.ph:
-; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; FIXED-NEXT: br label [[LOOP:%.*]]
; FIXED: loop:
; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -852,9 +852,9 @@ define void @load_store_factor4(ptr %p) {
; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; FIXED: middle.block:
-; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-NEXT: br label [[EXIT:%.*]]
; FIXED: scalar.ph:
-; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; FIXED-NEXT: br label [[LOOP:%.*]]
; FIXED: loop:
; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -1090,9 +1090,9 @@ define void @load_store_factor5(ptr %p) {
; FIXED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; FIXED: middle.block:
-; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-NEXT: br label [[EXIT:%.*]]
; FIXED: scalar.ph:
-; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; FIXED-NEXT: br label [[LOOP:%.*]]
; FIXED: loop:
; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -1353,9 +1353,9 @@ define void @load_store_factor6(ptr %p) {
; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; FIXED: middle.block:
-; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-NEXT: br label [[EXIT:%.*]]
; FIXED: scalar.ph:
-; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; FIXED-NEXT: br label [[LOOP:%.*]]
; FIXED: loop:
; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -1645,9 +1645,9 @@ define void @load_store_factor7(ptr %p) {
; FIXED-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; FIXED: middle.block:
-; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-NEXT: br label [[EXIT:%.*]]
; FIXED: scalar.ph:
-; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; FIXED-NEXT: br label [[LOOP:%.*]]
; FIXED: loop:
; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -1963,9 +1963,9 @@ define void @load_store_factor8(ptr %p) {
; FIXED-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; FIXED: middle.block:
-; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-NEXT: br label [[EXIT:%.*]]
; FIXED: scalar.ph:
-; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; FIXED-NEXT: br label [[LOOP:%.*]]
; FIXED: loop:
; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -2192,8 +2192,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
; CHECK-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i32> [[TMP8]], [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
-; CHECK-NEXT: store <vscale x 4 x i32> [[TMP10]], ptr [[TMP12]], align 4
+; CHECK-NEXT: store <vscale x 4 x i32> [[TMP10]], ptr [[TMP11]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
@@ -2241,17 +2240,16 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
; FIXED-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]]
; FIXED-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC4]]
; FIXED-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[INDEX]]
-; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i32 0
; FIXED-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP7]], i32 8
-; FIXED-NEXT: store <8 x i32> [[TMP5]], ptr [[TMP8]], align 4
+; FIXED-NEXT: store <8 x i32> [[TMP5]], ptr [[TMP7]], align 4
; FIXED-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP9]], align 4
; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; FIXED: middle.block:
-; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-NEXT: br label [[EXIT:%.*]]
; FIXED: scalar.ph:
-; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; FIXED-NEXT: br label [[LOOP:%.*]]
; FIXED: loop:
; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -2294,8 +2292,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
; SCALABLE-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i32> [[TMP8]], [[TMP9]]
; SCALABLE-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[INDEX]]
-; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
-; SCALABLE-NEXT: store <vscale x 4 x i32> [[TMP10]], ptr [[TMP12]], align 4
+; SCALABLE-NEXT: store <vscale x 4 x i32> [[TMP10]], ptr [[TMP11]], align 4
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
@@ -2372,8 +2369,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
; CHECK-NEXT: [[TMP10:%.*]] = add <vscale x 2 x i64> [[TMP8]], [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP11]], i32 0
-; CHECK-NEXT: store <vscale x 2 x i64> [[TMP10]], ptr [[TMP12]], align 8
+; CHECK-NEXT: store <vscale x 2 x i64> [[TMP10]], ptr [[TMP11]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
@@ -2421,17 +2417,16 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
; FIXED-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC1]]
; FIXED-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC3]], [[STRIDED_VEC4]]
; FIXED-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[INDEX]]
-; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP7]], i32 0
; FIXED-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP7]], i32 4
-; FIXED-NEXT: store <4 x i64> [[TMP5]], ptr [[TMP8]], align 8
+; FIXED-NEXT: store <4 x i64> [[TMP5]], ptr [[TMP7]], align 8
; FIXED-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP9]], align 8
; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; FIXED: middle.block:
-; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-NEXT: br label [[EXIT:%.*]]
; FIXED: scalar.ph:
-; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; FIXED-NEXT: br label [[LOOP:%.*]]
; FIXED: loop:
; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -2474,8 +2469,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
; SCALABLE-NEXT: [[TMP10:%.*]] = add <vscale x 2 x i64> [[TMP8]], [[TMP9]]
; SCALABLE-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[INDEX]]
-; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP11]], i32 0
-; SCALABLE-NEXT: store <vscale x 2 x i64> [[TMP10]], ptr [[TMP12]], align 8
+; SCALABLE-NEXT: store <vscale x 2 x i64> [[TMP10]], ptr [[TMP11]], align 8
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
index dbe6f27..c35a6ab 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" --version 2
; RUN: opt -mtriple=riscv64-none-linux-gnu -S -passes=loop-vectorize,instcombine -mattr=+v -prefer-predicate-over-epilogue=scalar-epilogue %s 2>&1 | FileCheck %s -check-prefix=SCALAR_EPILOGUE
-; RUN: opt -mtriple=riscv64-none-linux-gnu -S -passes=loop-vectorize,instcombine -mattr=+v -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-tail-folding-style=data %s 2>&1 | FileCheck %s -check-prefix=PREDICATED_DATA
-; RUN: opt -mtriple=riscv64-none-linux-gnu -S -passes=loop-vectorize,instcombine -mattr=+v -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-tail-folding-style=data-with-evl %s 2>&1 | FileCheck %s -check-prefix=PREDICATED_DATA-WITH-EVL
+; RUN: opt -mtriple=riscv64-none-linux-gnu -S -passes=loop-vectorize,instcombine -mattr=+v -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-tail-folding-style=data %s 2>&1 | FileCheck %s -check-prefix=PREDICATED_DATA
+; RUN: opt -mtriple=riscv64-none-linux-gnu -S -passes=loop-vectorize,instcombine -mattr=+v -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue %s 2>&1 | FileCheck %s -check-prefix=PREDICATED_DATA-WITH-EVL
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll
index 4d59f27..93e0f90 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll
@@ -20,10 +20,9 @@ define void @load_store(ptr %p) {
; LMUL1: vector.body:
; LMUL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; LMUL1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[INDEX]]
-; LMUL1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; LMUL1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP4]], align 8
+; LMUL1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP3]], align 8
; LMUL1-NEXT: [[TMP5:%.*]] = add <vscale x 1 x i64> [[WIDE_LOAD]], splat (i64 1)
-; LMUL1-NEXT: store <vscale x 1 x i64> [[TMP5]], ptr [[TMP4]], align 8
+; LMUL1-NEXT: store <vscale x 1 x i64> [[TMP5]], ptr [[TMP3]], align 8
; LMUL1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; LMUL1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; LMUL1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -62,10 +61,9 @@ define void @load_store(ptr %p) {
; LMUL2: vector.body:
; LMUL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; LMUL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[INDEX]]
-; LMUL2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; LMUL2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
+; LMUL2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP5]], align 8
; LMUL2-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[WIDE_LOAD]], splat (i64 1)
-; LMUL2-NEXT: store <vscale x 2 x i64> [[TMP7]], ptr [[TMP6]], align 8
+; LMUL2-NEXT: store <vscale x 2 x i64> [[TMP7]], ptr [[TMP5]], align 8
; LMUL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
; LMUL2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; LMUL2-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -104,10 +102,9 @@ define void @load_store(ptr %p) {
; LMUL4: vector.body:
; LMUL4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; LMUL4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[INDEX]]
-; LMUL4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; LMUL4-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i64>, ptr [[TMP6]], align 8
+; LMUL4-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i64>, ptr [[TMP5]], align 8
; LMUL4-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i64> [[WIDE_LOAD]], splat (i64 1)
-; LMUL4-NEXT: store <vscale x 4 x i64> [[TMP7]], ptr [[TMP6]], align 8
+; LMUL4-NEXT: store <vscale x 4 x i64> [[TMP7]], ptr [[TMP5]], align 8
; LMUL4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
; LMUL4-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; LMUL4-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -146,10 +143,9 @@ define void @load_store(ptr %p) {
; LMUL8: vector.body:
; LMUL8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; LMUL8-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[INDEX]]
-; LMUL8-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; LMUL8-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i64>, ptr [[TMP6]], align 8
+; LMUL8-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i64>, ptr [[TMP5]], align 8
; LMUL8-NEXT: [[TMP7:%.*]] = add <vscale x 8 x i64> [[WIDE_LOAD]], splat (i64 1)
-; LMUL8-NEXT: store <vscale x 8 x i64> [[TMP7]], ptr [[TMP6]], align 8
+; LMUL8-NEXT: store <vscale x 8 x i64> [[TMP7]], ptr [[TMP5]], align 8
; LMUL8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
; LMUL8-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; LMUL8-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
index e2641ab..0a87257 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -134,14 +134,11 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 8, i32 4, i1 true)
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8:%.*]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP9]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; CHECK-NEXT: [[TMP10:%.*]] = shl <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11:%.*]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: call void @llvm.vp.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr align 1 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP9:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; CHECK-NEXT: [[TMP6:%.*]] = shl <vscale x 4 x i8> [[VP_OP_LOAD]], splat (i8 1)
+; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP12:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i8> [[TMP6]], [[VP_OP_LOAD1]]
+; CHECK-NEXT: call void @llvm.vp.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP7]], ptr align 1 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_END:%.*]]
@@ -150,10 +147,10 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[I_08]]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[I_08]]
; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP15]], 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 [[I_08]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 [[I_08]]
; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]]
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
@@ -190,26 +187,23 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0:%.*]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1:%.*]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = shl <16 x i8> [[WIDE_LOAD]], splat (i8 1)
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3:%.*]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4:%.*]], align 1
; CHECK-NEXT: [[TMP5:%.*]] = add <16 x i8> [[TMP2]], [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: store <16 x i8> [[TMP5]], ptr [[TMP6]], align 1
+; CHECK-NEXT: store <16 x i8> [[TMP5]], ptr [[TMP4]], align 1
; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 [[I_08]]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[I_08]]
; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP7]], 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[I_08]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 [[I_08]]
; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP8]]
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
@@ -247,26 +241,23 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0:%.*]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP1:%.*]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = shl <32 x i8> [[WIDE_LOAD]], splat (i8 1)
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3:%.*]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP4:%.*]], align 1
; CHECK-NEXT: [[TMP5:%.*]] = add <32 x i8> [[TMP2]], [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: store <32 x i8> [[TMP5]], ptr [[TMP6]], align 1
+; CHECK-NEXT: store <32 x i8> [[TMP5]], ptr [[TMP4]], align 1
; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 [[I_08]]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[I_08]]
; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP7]], 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[I_08]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 [[I_08]]
; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP8]]
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
@@ -305,21 +296,19 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = shl <8 x i8> [[WIDE_LOAD]], splat (i8 1)
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP4]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i8> [[TMP3]], [[WIDE_LOAD1]]
-; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1
+; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP4]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
@@ -377,8 +366,7 @@ define i8 @mul_non_pow_2_low_trip_count(ptr noalias %a) {
; CHECK-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = icmp ule <16 x i64> [[VEC_IV]], splat (i64 9)
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP2]] = mul <16 x i8> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> [[TMP2]], <16 x i8> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
index 4ccc45d4..10ba208 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
@@ -36,13 +36,11 @@ define void @test(ptr noalias nocapture %a, ptr noalias nocapture %b, i32 %v) {
; VLENUNK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLENUNK-NEXT: [[TMP13:%.*]] = icmp ult <vscale x 4 x i64> [[VEC_IND]], splat (i64 512)
; VLENUNK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]]
-; VLENUNK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP14]], i32 0
-; VLENUNK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP15]], i32 4, <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i32> poison)
+; VLENUNK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i32> poison)
; VLENUNK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> zeroinitializer
; VLENUNK-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[PREDPHI]], [[BROADCAST_SPLAT]]
; VLENUNK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
-; VLENUNK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
-; VLENUNK-NEXT: store <vscale x 4 x i32> [[TMP17]], ptr [[TMP19]], align 4
+; VLENUNK-NEXT: store <vscale x 4 x i32> [[TMP17]], ptr [[TMP18]], align 4
; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; VLENUNK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; VLENUNK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/only-compute-cost-for-vplan-vfs.ll b/llvm/test/Transforms/LoopVectorize/RISCV/only-compute-cost-for-vplan-vfs.ll
index 4f3b8d8..0afe04e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/only-compute-cost-for-vplan-vfs.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/only-compute-cost-for-vplan-vfs.ll
@@ -1,5 +1,5 @@
-; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: opt -passes=loop-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S -debug %s 2>&1 | FileCheck %s
; REQUIRES: asserts
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/ordered-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/ordered-reduction.ll
index 6668cd6..9ec8ba4 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/ordered-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/ordered-reduction.ll
@@ -20,8 +20,7 @@ define float @fadd(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-ORDERED-NEXT: [[TMP3]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -58,8 +57,7 @@ define float @fadd(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-UNORDERED-NEXT: [[TMP3]] = fadd <4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
index b8d5cbd..ee6b950 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
@@ -26,12 +26,10 @@ define i32 @vqdot(ptr %a, ptr %b) #0 {
; V-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; V-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
; V-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; V-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
-; V-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; V-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1
; V-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; V-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; V-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
-; V-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; V-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; V-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; V-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
; V-NEXT: [[TMP13]] = add <vscale x 4 x i32> [[TMP12]], [[VEC_PHI]]
@@ -63,12 +61,10 @@ define i32 @vqdot(ptr %a, ptr %b) #0 {
; ZVQDOTQ-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
-; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1
; ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; ZVQDOTQ-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
-; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; ZVQDOTQ-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
@@ -92,16 +88,14 @@ define i32 @vqdot(ptr %a, ptr %b) #0 {
; FIXED-V-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
; FIXED-V-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
; FIXED-V-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; FIXED-V-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
; FIXED-V-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
-; FIXED-V-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; FIXED-V-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
; FIXED-V-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
; FIXED-V-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
; FIXED-V-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
; FIXED-V-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; FIXED-V-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
; FIXED-V-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
-; FIXED-V-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
+; FIXED-V-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
; FIXED-V-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
; FIXED-V-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
; FIXED-V-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
@@ -115,7 +109,7 @@ define i32 @vqdot(ptr %a, ptr %b) #0 {
; FIXED-V: middle.block:
; FIXED-V-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]]
; FIXED-V-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]])
-; FIXED-V-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-V-NEXT: br label [[FOR_EXIT:%.*]]
; FIXED-V: scalar.ph:
;
; FIXED-ZVQDOTQ-LABEL: define i32 @vqdot(
@@ -129,16 +123,14 @@ define i32 @vqdot(ptr %a, ptr %b) #0 {
; FIXED-ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; FIXED-ZVQDOTQ-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
; FIXED-ZVQDOTQ-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; FIXED-ZVQDOTQ-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
-; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; FIXED-ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
-; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
+; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
@@ -152,7 +144,7 @@ define i32 @vqdot(ptr %a, ptr %b) #0 {
; FIXED-ZVQDOTQ: middle.block:
; FIXED-ZVQDOTQ-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
; FIXED-ZVQDOTQ-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]])
-; FIXED-ZVQDOTQ-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-ZVQDOTQ-NEXT: br label [[FOR_EXIT:%.*]]
; FIXED-ZVQDOTQ: scalar.ph:
;
entry:
@@ -198,12 +190,10 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 {
; V-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; V-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
; V-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; V-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
-; V-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; V-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1
; V-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; V-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; V-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
-; V-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; V-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; V-NEXT: [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; V-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
; V-NEXT: [[TMP13]] = add <vscale x 4 x i32> [[TMP12]], [[VEC_PHI]]
@@ -235,12 +225,10 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 {
; ZVQDOTQ-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
-; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1
; ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; ZVQDOTQ-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
-; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; ZVQDOTQ-NEXT: [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
@@ -264,16 +252,14 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 {
; FIXED-V-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
; FIXED-V-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
; FIXED-V-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; FIXED-V-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
; FIXED-V-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
-; FIXED-V-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; FIXED-V-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
; FIXED-V-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
; FIXED-V-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
; FIXED-V-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
; FIXED-V-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; FIXED-V-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
; FIXED-V-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
-; FIXED-V-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
+; FIXED-V-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
; FIXED-V-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
; FIXED-V-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
; FIXED-V-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
@@ -287,7 +273,7 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 {
; FIXED-V: middle.block:
; FIXED-V-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]]
; FIXED-V-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]])
-; FIXED-V-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-V-NEXT: br label [[FOR_EXIT:%.*]]
; FIXED-V: scalar.ph:
;
; FIXED-ZVQDOTQ-LABEL: define i32 @vqdotu(
@@ -301,16 +287,14 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 {
; FIXED-ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; FIXED-ZVQDOTQ-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
; FIXED-ZVQDOTQ-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; FIXED-ZVQDOTQ-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
-; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; FIXED-ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
-; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
+; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
@@ -324,7 +308,7 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 {
; FIXED-ZVQDOTQ: middle.block:
; FIXED-ZVQDOTQ-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
; FIXED-ZVQDOTQ-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]])
-; FIXED-ZVQDOTQ-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-ZVQDOTQ-NEXT: br label [[FOR_EXIT:%.*]]
; FIXED-ZVQDOTQ: scalar.ph:
;
entry:
@@ -370,12 +354,10 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 {
; V-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; V-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
; V-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; V-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
-; V-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; V-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1
; V-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; V-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; V-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
-; V-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; V-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; V-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; V-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
; V-NEXT: [[TMP13]] = add <vscale x 4 x i32> [[TMP12]], [[VEC_PHI]]
@@ -407,12 +389,10 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 {
; ZVQDOTQ-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
-; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1
; ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; ZVQDOTQ-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
-; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; ZVQDOTQ-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
@@ -436,16 +416,14 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 {
; FIXED-V-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
; FIXED-V-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
; FIXED-V-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; FIXED-V-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
; FIXED-V-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
-; FIXED-V-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; FIXED-V-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
; FIXED-V-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
; FIXED-V-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
; FIXED-V-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
; FIXED-V-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; FIXED-V-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
; FIXED-V-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
-; FIXED-V-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
+; FIXED-V-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
; FIXED-V-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
; FIXED-V-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
; FIXED-V-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
@@ -459,7 +437,7 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 {
; FIXED-V: middle.block:
; FIXED-V-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]]
; FIXED-V-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]])
-; FIXED-V-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-V-NEXT: br label [[FOR_EXIT:%.*]]
; FIXED-V: scalar.ph:
;
; FIXED-ZVQDOTQ-LABEL: define i32 @vqdotsu(
@@ -473,16 +451,14 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 {
; FIXED-ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; FIXED-ZVQDOTQ-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
; FIXED-ZVQDOTQ-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; FIXED-ZVQDOTQ-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
-; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; FIXED-ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
-; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
+; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
@@ -496,7 +472,7 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 {
; FIXED-ZVQDOTQ: middle.block:
; FIXED-ZVQDOTQ-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
; FIXED-ZVQDOTQ-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]])
-; FIXED-ZVQDOTQ-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-ZVQDOTQ-NEXT: br label [[FOR_EXIT:%.*]]
; FIXED-ZVQDOTQ: scalar.ph:
;
entry:
@@ -541,12 +517,10 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 {
; V-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; V-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
; V-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; V-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
-; V-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; V-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1
; V-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; V-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; V-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
-; V-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; V-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; V-NEXT: [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; V-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
; V-NEXT: [[TMP13]] = add <vscale x 4 x i32> [[TMP12]], [[VEC_PHI]]
@@ -578,12 +552,10 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 {
; ZVQDOTQ-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
-; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1
; ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; ZVQDOTQ-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
-; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP9]], align 1
; ZVQDOTQ-NEXT: [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
@@ -607,16 +579,14 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 {
; FIXED-V-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
; FIXED-V-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
; FIXED-V-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; FIXED-V-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
; FIXED-V-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
-; FIXED-V-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; FIXED-V-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
; FIXED-V-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
; FIXED-V-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
; FIXED-V-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
; FIXED-V-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; FIXED-V-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
; FIXED-V-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
-; FIXED-V-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
+; FIXED-V-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
; FIXED-V-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
; FIXED-V-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
; FIXED-V-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
@@ -630,7 +600,7 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 {
; FIXED-V: middle.block:
; FIXED-V-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]]
; FIXED-V-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]])
-; FIXED-V-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-V-NEXT: br label [[FOR_EXIT:%.*]]
; FIXED-V: scalar.ph:
;
; FIXED-ZVQDOTQ-LABEL: define i32 @vqdotsu2(
@@ -644,16 +614,14 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 {
; FIXED-ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; FIXED-ZVQDOTQ-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
; FIXED-ZVQDOTQ-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; FIXED-ZVQDOTQ-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
-; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; FIXED-ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
-; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
+; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
@@ -667,7 +635,7 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 {
; FIXED-ZVQDOTQ: middle.block:
; FIXED-ZVQDOTQ-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
; FIXED-ZVQDOTQ-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]])
-; FIXED-ZVQDOTQ-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-ZVQDOTQ-NEXT: br label [[FOR_EXIT:%.*]]
; FIXED-ZVQDOTQ: scalar.ph:
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll
index 8088a65..b5b62d0 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll
@@ -49,8 +49,7 @@ define void @pr87378_vpinstruction_or_drop_poison_generating_flags(ptr %arg, i64
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <vscale x 8 x i1> [[TMP21]], i32 0
; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP23]], i64 poison, i64 [[INDEX]]
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i16, ptr [[ARG]], i64 [[PREDPHI]]
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i16, ptr [[TMP24]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> zeroinitializer, ptr [[TMP25]], i32 2, <vscale x 8 x i1> [[TMP22]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> zeroinitializer, ptr [[TMP24]], i32 2, <vscale x 8 x i1> [[TMP22]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/preserve-dbg-loc.ll b/llvm/test/Transforms/LoopVectorize/RISCV/preserve-dbg-loc.ll
index e4892db..f4817bb 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/preserve-dbg-loc.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/preserve-dbg-loc.ll
@@ -1,6 +1,5 @@
; RUN: opt -passes=debugify,loop-vectorize \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -S < %s 2>&1 | FileCheck --check-prefix=DEBUGLOC %s
; Testing that the debug locations of the generated vector intrinsic are the same as
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll
index 0a367c0..642eef5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll
@@ -21,15 +21,12 @@ define ptr @array_add(ptr noalias nocapture readonly %a, ptr noalias nocapture r
; LMUL1: vector.body:
; LMUL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; LMUL1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; LMUL1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; LMUL1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
+; LMUL1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4
; LMUL1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
-; LMUL1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; LMUL1-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
+; LMUL1-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
; LMUL1-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; LMUL1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[INDEX]]
-; LMUL1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; LMUL1-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP8]], align 4
+; LMUL1-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP7]], align 4
; LMUL1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; LMUL1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; LMUL1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -72,15 +69,12 @@ define ptr @array_add(ptr noalias nocapture readonly %a, ptr noalias nocapture r
; LMUL2: vector.body:
; LMUL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; LMUL2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; LMUL2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; LMUL2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
+; LMUL2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4
; LMUL2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
-; LMUL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; LMUL2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
+; LMUL2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
; LMUL2-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; LMUL2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[INDEX]]
-; LMUL2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; LMUL2-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP8]], align 4
+; LMUL2-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP7]], align 4
; LMUL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; LMUL2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; LMUL2-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index f59ab56..0b3dcf8 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -986,7 +986,7 @@ define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) {
; RV64-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020
; RV64-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; RV64: [[MIDDLE_BLOCK]]:
-; RV64-NEXT: br i1 false, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; RV64-NEXT: br label %[[SCALAR_PH]]
; RV64: [[SCALAR_PH]]:
; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
; RV64-NEXT: br label %[[FOR_BODY:.*]]
@@ -1038,7 +1038,7 @@ define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) {
; RV32-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020
; RV32-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; RV32: [[MIDDLE_BLOCK]]:
-; RV32-NEXT: br i1 false, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; RV32-NEXT: br label %[[SCALAR_PH]]
; RV32: [[SCALAR_PH]]:
; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
; RV32-NEXT: br label %[[FOR_BODY:.*]]
@@ -1123,7 +1123,7 @@ define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) {
; RV64-UF2-NEXT: [[TMP58:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1016
; RV64-UF2-NEXT: br i1 [[TMP58]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; RV64-UF2: [[MIDDLE_BLOCK]]:
-; RV64-UF2-NEXT: br i1 false, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; RV64-UF2-NEXT: br label %[[SCALAR_PH]]
; RV64-UF2: [[SCALAR_PH]]:
; RV64-UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 7, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
; RV64-UF2-NEXT: br label %[[FOR_BODY:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
index a901fe7..3370e92 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
@@ -25,12 +25,10 @@ define void @test(ptr %p) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP8]], align 32
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 32
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 200
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0
-; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP11]], align 32
+; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP10]], align 32
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -82,19 +80,17 @@ define void @test_may_clobber(ptr %p) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 100
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
+; CHECK-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP4]], align 32
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -146,12 +142,10 @@ define void @trivial_due_max_vscale(ptr %p) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP8]], align 32
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 32
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 8192
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0
-; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP11]], align 32
+; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP10]], align 32
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -211,12 +205,10 @@ define void @no_high_lmul_or_interleave(ptr %p) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP8]], align 32
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 32
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1024
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0
-; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP11]], align 32
+; CHECK-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP10]], align 32
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll
index c3c4abb..e51f6fa 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll
@@ -25,10 +25,9 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = add <vscale x 2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <vscale x 2 x i64> [[TMP8]], ptr [[TMP7]], align 8
+; CHECK-NEXT: store <vscale x 2 x i64> [[TMP8]], ptr [[TMP6]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -90,10 +89,9 @@ define void @vector_add_i32(ptr noalias nocapture %a, i32 %v, i64 %n) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store <vscale x 4 x i32> [[TMP8]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store <vscale x 4 x i32> [[TMP8]], ptr [[TMP6]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -193,8 +191,7 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_LOAD]]
; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x ptr> [[TMP8]], i32 8, <vscale x 2 x i1> splat (i1 true))
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -254,8 +251,7 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_LOAD]]
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP8]], i32 8, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> poison)
; CHECK-NEXT: [[TMP9]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
@@ -324,8 +320,7 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8
+; CHECK-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -380,8 +375,7 @@ define void @splat_ptr(ptr noalias nocapture %a, ptr %v, i64 %n) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8
+; CHECK-NEXT: store <vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
index b046f61..f6f8f6a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=on -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=CHECK
+; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=on -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=CHECK
; Exercise tail folding on RISCV w/scalable vectors.
@@ -26,14 +26,13 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = sub i64 1025, [[EVL_BASED_IV]]
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
-; CHECK-NEXT: [[TMP10:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP10]], ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
-; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP7]] to i64
-; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP12]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
+; CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[VP_OP_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP7]], ptr align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
+; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP9]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -95,8 +94,7 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; CHECK-NEXT: [[AVL:%.*]] = sub i64 1025, [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[EVL_BASED_IV]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x ptr> align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP7]] to i64
@@ -159,8 +157,7 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64
; CHECK-NEXT: [[AVL:%.*]] = sub i64 1025, [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[EVL_BASED_IV]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
@@ -231,12 +228,11 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = sub i64 1025, [[EVL_BASED_IV]]
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
-; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP7]] to i64
-; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP8]])
+; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP8]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
@@ -293,10 +289,9 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; CHECK-NEXT: store i64 [[V]], ptr [[B:%.*]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
-; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP7]] to i64
-; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[EVL_BASED_IV]]
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP9]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
@@ -335,17 +330,44 @@ for.end:
define i64 @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %n) {
; CHECK-LABEL: @uniform_load(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK: vector.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[B:%.*]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
-; CHECK-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY1:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
+; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[B]], align 8
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]]
+; CHECK-NEXT: store i64 [[V1]], ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], 1025
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: for.end:
-; CHECK-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V1]], [[FOR_BODY1]] ], [ [[V]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[V_LCSSA]]
;
entry:
@@ -385,17 +407,16 @@ define void @vector_add_trip1024(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
-; CHECK-NEXT: [[TMP10:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP10]], ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
-; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP7]] to i64
-; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP12]], [[EVL_BASED_IV]]
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
+; CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[VP_OP_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP7]], ptr align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
+; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP9]] to i64
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
@@ -409,7 +430,7 @@ define void @vector_add_trip1024(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK-NEXT: store i64 [[ADD]], ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
index 01b4ad2..745b8ba 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
@@ -17,8 +17,7 @@ define i32 @select_icmp(i32 %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0 {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp sge <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -45,8 +44,7 @@ define i32 @select_icmp(i32 %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0 {
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[INDEX]]
-; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
+; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
; SCALABLE-NEXT: [[TMP8:%.*]] = icmp sge <vscale x 4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; SCALABLE-NEXT: [[TMP9]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP8]]
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
@@ -88,8 +86,7 @@ define i32 @select_fcmp(float %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = fcmp fast uge <4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -116,8 +113,7 @@ define i32 @select_fcmp(float %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[INDEX]]
-; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
-; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
+; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP5]], align 4
; SCALABLE-NEXT: [[TMP8:%.*]] = fcmp fast uge <vscale x 4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; SCALABLE-NEXT: [[TMP9]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP8]]
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
@@ -157,8 +153,7 @@ define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD]], splat (i32 3)
; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -183,8 +178,7 @@ define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 {
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[INDEX]]
-; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
+; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
; SCALABLE-NEXT: [[TMP8:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_LOAD]], splat (i32 3)
; SCALABLE-NEXT: [[TMP9]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP8]]
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
@@ -224,8 +218,7 @@ define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD]], splat (i32 3)
; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -250,8 +243,7 @@ define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[INDEX]]
-; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
+; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
; SCALABLE-NEXT: [[TMP8:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_LOAD]], splat (i32 3)
; SCALABLE-NEXT: [[TMP9]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP8]]
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
@@ -291,8 +283,7 @@ define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) #0 {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[V:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = fcmp fast one <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00)
; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -317,8 +308,7 @@ define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) #0 {
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[V:%.*]], i64 [[INDEX]]
-; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
-; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
+; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP5]], align 4
; SCALABLE-NEXT: [[TMP8:%.*]] = fcmp fast one <vscale x 4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00)
; SCALABLE-NEXT: [[TMP9]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP8]]
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
@@ -383,12 +373,10 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC1:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 35)
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[SRC2:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP6]], i32 4, <4 x i1> [[TMP4]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP5]], i32 4, <4 x i1> [[TMP4]], <4 x i32> poison)
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 2)
; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i1> [[VEC_PHI]], [[TMP8]]
; CHECK-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP9]], <4 x i1> [[VEC_PHI]]
@@ -414,12 +402,10 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[SRC1:%.*]], i64 [[INDEX]]
-; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
+; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
; SCALABLE-NEXT: [[TMP8:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], splat (i32 35)
; SCALABLE-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[SRC2:%.*]], i64 [[INDEX]]
-; SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP9]], i32 0
-; SCALABLE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> poison)
+; SCALABLE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP9]], i32 4, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> poison)
; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 2)
; SCALABLE-NEXT: [[TMP13:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP12]]
; SCALABLE-NEXT: [[PREDPHI]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[VEC_PHI]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
index d9598bc..13a4b16 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
@@ -8,20 +8,18 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran
; CHECK: vector.ph:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1:%.*]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], splat (i32 1)
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP1]], align 4
; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP1:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 [[IV]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 [[IV]]
; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[GEP]], align 4
; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[V]], 1
; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP]], align 4
@@ -55,20 +53,18 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang
; CHECK: vector.ph:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1:%.*]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], splat (i32 1)
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP1]], align 4
; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP1:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 [[IV]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 [[IV]]
; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[GEP]], align 4
; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[V]], 1
; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index 25dac36..80f0274 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -250,10 +250,9 @@ define void @single_stride_int_scaled(ptr %p, i64 %stride) {
; NOSTRIDED: vector.body:
; NOSTRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NOSTRIDED-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[INDEX]]
-; NOSTRIDED-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
-; NOSTRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP9]], align 4
+; NOSTRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
; NOSTRIDED-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], splat (i32 1)
-; NOSTRIDED-NEXT: store <vscale x 4 x i32> [[TMP10]], ptr [[TMP9]], align 4
+; NOSTRIDED-NEXT: store <vscale x 4 x i32> [[TMP10]], ptr [[TMP8]], align 4
; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; NOSTRIDED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NOSTRIDED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -332,10 +331,9 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) {
; NOSTRIDED: vector.body:
; NOSTRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NOSTRIDED-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[INDEX]]
-; NOSTRIDED-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
-; NOSTRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP9]], align 4
+; NOSTRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
; NOSTRIDED-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], splat (i32 1)
-; NOSTRIDED-NEXT: store <vscale x 4 x i32> [[TMP10]], ptr [[TMP9]], align 4
+; NOSTRIDED-NEXT: store <vscale x 4 x i32> [[TMP10]], ptr [[TMP8]], align 4
; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; NOSTRIDED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NOSTRIDED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -463,12 +461,10 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; NOSTRIDED: vector.body:
; NOSTRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NOSTRIDED-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[P]], i64 [[INDEX]]
-; NOSTRIDED-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
-; NOSTRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
+; NOSTRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP12]], align 4
; NOSTRIDED-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], splat (i32 1)
; NOSTRIDED-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P2]], i64 [[INDEX]]
-; NOSTRIDED-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP15]], i32 0
-; NOSTRIDED-NEXT: store <vscale x 4 x i32> [[TMP14]], ptr [[TMP16]], align 4
+; NOSTRIDED-NEXT: store <vscale x 4 x i32> [[TMP14]], ptr [[TMP15]], align 4
; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
; NOSTRIDED-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NOSTRIDED-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -636,10 +632,9 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) {
; NOSTRIDED: vector.body:
; NOSTRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NOSTRIDED-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[INDEX]]
-; NOSTRIDED-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
-; NOSTRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP9]], align 4
+; NOSTRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
; NOSTRIDED-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], splat (i32 1)
-; NOSTRIDED-NEXT: store <vscale x 4 x i32> [[TMP10]], ptr [[TMP9]], align 4
+; NOSTRIDED-NEXT: store <vscale x 4 x i32> [[TMP10]], ptr [[TMP8]], align 4
; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; NOSTRIDED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NOSTRIDED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-bin-unary-ops-args.ll
index 16c575f..f9395aa 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-bin-unary-ops-args.ll
@@ -1,12 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s --check-prefix=IF-EVL
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=none \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s --check-prefix=NO-VP
@@ -39,12 +37,10 @@ define void @test_and(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = and <vscale x 16 x i8> [[VP_OP_LOAD]], splat (i8 1)
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP17]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP16]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
@@ -71,9 +67,45 @@ define void @test_and(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-LABEL: define void @test_and(
; NO-VP-SAME: ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]]) #[[ATTR0:[0-9]+]] {
; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP13]], 16
+; NO-VP-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP6]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 16
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
+; NO-VP-NEXT: [[TMP10:%.*]] = and <vscale x 16 x i8> [[WIDE_LOAD]], splat (i8 1)
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 16 x i8> [[TMP10]], ptr [[TMP11]], align 1
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
@@ -81,7 +113,7 @@ define void @test_and(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
-; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; NO-VP: [[FINISH_LOOPEXIT]]:
; NO-VP-NEXT: ret void
;
@@ -132,12 +164,10 @@ define void @test_or(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = or <vscale x 16 x i8> [[VP_OP_LOAD]], splat (i8 1)
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP17]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP16]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
@@ -164,9 +194,45 @@ define void @test_or(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-LABEL: define void @test_or(
; NO-VP-SAME: ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP13]], 16
+; NO-VP-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP6]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 16
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
+; NO-VP-NEXT: [[TMP10:%.*]] = or <vscale x 16 x i8> [[WIDE_LOAD]], splat (i8 1)
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 16 x i8> [[TMP10]], ptr [[TMP11]], align 1
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
@@ -174,7 +240,7 @@ define void @test_or(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
-; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; NO-VP: [[FINISH_LOOPEXIT]]:
; NO-VP-NEXT: ret void
;
@@ -225,12 +291,10 @@ define void @test_xor(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = xor <vscale x 16 x i8> [[VP_OP_LOAD]], splat (i8 1)
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP17]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP16]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
@@ -257,9 +321,45 @@ define void @test_xor(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-LABEL: define void @test_xor(
; NO-VP-SAME: ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP13]], 16
+; NO-VP-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP6]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 16
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
+; NO-VP-NEXT: [[TMP10:%.*]] = xor <vscale x 16 x i8> [[WIDE_LOAD]], splat (i8 1)
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 16 x i8> [[TMP10]], ptr [[TMP11]], align 1
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
@@ -267,7 +367,7 @@ define void @test_xor(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
-; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
; NO-VP: [[FINISH_LOOPEXIT]]:
; NO-VP-NEXT: ret void
;
@@ -318,12 +418,10 @@ define void @test_shl(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = shl <vscale x 16 x i8> [[VP_OP_LOAD]], splat (i8 1)
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP17]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP16]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
@@ -350,9 +448,45 @@ define void @test_shl(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-LABEL: define void @test_shl(
; NO-VP-SAME: ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP13]], 16
+; NO-VP-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP6]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 16
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
+; NO-VP-NEXT: [[TMP10:%.*]] = shl <vscale x 16 x i8> [[WIDE_LOAD]], splat (i8 1)
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 16 x i8> [[TMP10]], ptr [[TMP11]], align 1
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
@@ -360,7 +494,7 @@ define void @test_shl(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
-; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
; NO-VP: [[FINISH_LOOPEXIT]]:
; NO-VP-NEXT: ret void
;
@@ -411,12 +545,10 @@ define void @test_lshr(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = lshr <vscale x 16 x i8> [[VP_OP_LOAD]], splat (i8 1)
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP17]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP16]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
@@ -443,9 +575,45 @@ define void @test_lshr(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-LABEL: define void @test_lshr(
; NO-VP-SAME: ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP13]], 16
+; NO-VP-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP6]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 16
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
+; NO-VP-NEXT: [[TMP10:%.*]] = lshr <vscale x 16 x i8> [[WIDE_LOAD]], splat (i8 1)
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 16 x i8> [[TMP10]], ptr [[TMP11]], align 1
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
@@ -453,7 +621,7 @@ define void @test_lshr(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
-; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
; NO-VP: [[FINISH_LOOPEXIT]]:
; NO-VP-NEXT: ret void
;
@@ -504,12 +672,10 @@ define void @test_ashr(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = ashr <vscale x 16 x i8> [[VP_OP_LOAD]], splat (i8 1)
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP17]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP16]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
@@ -536,9 +702,45 @@ define void @test_ashr(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-LABEL: define void @test_ashr(
; NO-VP-SAME: ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP13]], 16
+; NO-VP-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP6]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 16
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
+; NO-VP-NEXT: [[TMP10:%.*]] = ashr <vscale x 16 x i8> [[WIDE_LOAD]], splat (i8 1)
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 16 x i8> [[TMP10]], ptr [[TMP11]], align 1
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
@@ -546,7 +748,7 @@ define void @test_ashr(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
-; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
; NO-VP: [[FINISH_LOOPEXIT]]:
; NO-VP-NEXT: ret void
;
@@ -597,12 +799,10 @@ define void @test_add(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = add <vscale x 16 x i8> [[VP_OP_LOAD]], splat (i8 1)
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP17]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP16]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
@@ -629,9 +829,45 @@ define void @test_add(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-LABEL: define void @test_add(
; NO-VP-SAME: ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP13]], 16
+; NO-VP-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP6]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 16
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
+; NO-VP-NEXT: [[TMP10:%.*]] = add <vscale x 16 x i8> [[WIDE_LOAD]], splat (i8 1)
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 16 x i8> [[TMP10]], ptr [[TMP11]], align 1
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
@@ -639,7 +875,7 @@ define void @test_add(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
-; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
; NO-VP: [[FINISH_LOOPEXIT]]:
; NO-VP-NEXT: ret void
;
@@ -690,12 +926,10 @@ define void @test_sub(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = sub <vscale x 16 x i8> [[VP_OP_LOAD]], splat (i8 1)
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP17]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP16]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
@@ -722,9 +956,45 @@ define void @test_sub(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-LABEL: define void @test_sub(
; NO-VP-SAME: ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP13]], 16
+; NO-VP-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP6]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 16
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
+; NO-VP-NEXT: [[TMP10:%.*]] = sub <vscale x 16 x i8> [[WIDE_LOAD]], splat (i8 1)
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 16 x i8> [[TMP10]], ptr [[TMP11]], align 1
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
@@ -732,7 +1002,7 @@ define void @test_sub(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
-; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
; NO-VP: [[FINISH_LOOPEXIT]]:
; NO-VP-NEXT: ret void
;
@@ -783,12 +1053,10 @@ define void @test_mul(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = mul <vscale x 16 x i8> [[VP_OP_LOAD]], splat (i8 3)
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP17]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP16]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
@@ -815,9 +1083,45 @@ define void @test_mul(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-LABEL: define void @test_mul(
; NO-VP-SAME: ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP13]], 16
+; NO-VP-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP6]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 16
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
+; NO-VP-NEXT: [[TMP10:%.*]] = mul <vscale x 16 x i8> [[WIDE_LOAD]], splat (i8 3)
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 16 x i8> [[TMP10]], ptr [[TMP11]], align 1
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
@@ -825,7 +1129,7 @@ define void @test_mul(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
-; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
; NO-VP: [[FINISH_LOOPEXIT]]:
; NO-VP-NEXT: ret void
;
@@ -876,12 +1180,10 @@ define void @test_sdiv(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = sdiv <vscale x 16 x i8> [[VP_OP_LOAD]], splat (i8 3)
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP17]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP16]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
@@ -908,9 +1210,45 @@ define void @test_sdiv(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-LABEL: define void @test_sdiv(
; NO-VP-SAME: ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP13]], 16
+; NO-VP-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP6]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 16
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
+; NO-VP-NEXT: [[TMP10:%.*]] = sdiv <vscale x 16 x i8> [[WIDE_LOAD]], splat (i8 3)
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 16 x i8> [[TMP10]], ptr [[TMP11]], align 1
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
@@ -918,7 +1256,7 @@ define void @test_sdiv(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
-; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP21:![0-9]+]]
; NO-VP: [[FINISH_LOOPEXIT]]:
; NO-VP-NEXT: ret void
;
@@ -969,12 +1307,10 @@ define void @test_udiv(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = udiv <vscale x 16 x i8> [[VP_OP_LOAD]], splat (i8 3)
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP17]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP16]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
@@ -1001,9 +1337,45 @@ define void @test_udiv(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-LABEL: define void @test_udiv(
; NO-VP-SAME: ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP13]], 16
+; NO-VP-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP6]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 16
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
+; NO-VP-NEXT: [[TMP10:%.*]] = udiv <vscale x 16 x i8> [[WIDE_LOAD]], splat (i8 3)
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 16 x i8> [[TMP10]], ptr [[TMP11]], align 1
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
@@ -1011,7 +1383,7 @@ define void @test_udiv(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
-; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP23:![0-9]+]]
; NO-VP: [[FINISH_LOOPEXIT]]:
; NO-VP-NEXT: ret void
;
@@ -1062,12 +1434,10 @@ define void @test_srem(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = srem <vscale x 16 x i8> [[VP_OP_LOAD]], splat (i8 3)
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP17]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP16]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
@@ -1094,9 +1464,45 @@ define void @test_srem(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-LABEL: define void @test_srem(
; NO-VP-SAME: ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP13]], 16
+; NO-VP-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP6]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 16
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
+; NO-VP-NEXT: [[TMP10:%.*]] = srem <vscale x 16 x i8> [[WIDE_LOAD]], splat (i8 3)
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 16 x i8> [[TMP10]], ptr [[TMP11]], align 1
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
@@ -1104,7 +1510,7 @@ define void @test_srem(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
-; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP25:![0-9]+]]
; NO-VP: [[FINISH_LOOPEXIT]]:
; NO-VP-NEXT: ret void
;
@@ -1155,12 +1561,10 @@ define void @test_urem(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = urem <vscale x 16 x i8> [[VP_OP_LOAD]], splat (i8 3)
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP17]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_OP]], ptr align 1 [[TMP16]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
@@ -1187,9 +1591,45 @@ define void @test_urem(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-LABEL: define void @test_urem(
; NO-VP-SAME: ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP13]], 16
+; NO-VP-NEXT: [[TMP14:%.*]] = call i64 @llvm.umax.i64(i64 32, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP14]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP6]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 16
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
+; NO-VP-NEXT: [[TMP10:%.*]] = urem <vscale x 16 x i8> [[WIDE_LOAD]], splat (i8 3)
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 16 x i8> [[TMP10]], ptr [[TMP11]], align 1
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
@@ -1197,7 +1637,7 @@ define void @test_urem(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
-; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP27:![0-9]+]]
; NO-VP: [[FINISH_LOOPEXIT]]:
; NO-VP-NEXT: ret void
;
@@ -1251,12 +1691,10 @@ define void @test_fadd(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 100, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = fadd fast <vscale x 4 x float> [[VP_OP_LOAD]], splat (float 3.000000e+00)
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[VP_OP]], ptr align 4 [[TMP18]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[VP_OP]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP19]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
@@ -1283,9 +1721,46 @@ define void @test_fadd(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-LABEL: define void @test_fadd(
; NO-VP-SAME: ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP15]], 4
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP2]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[B1]], [[A2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP8]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP11]], align 4
+; NO-VP-NEXT: [[TMP12:%.*]] = fadd fast <vscale x 4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00)
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 4 x float> [[TMP12]], ptr [[TMP13]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]]
; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
@@ -1293,7 +1768,7 @@ define void @test_fadd(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]]
; NO-VP-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4
; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
-; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP29:![0-9]+]]
; NO-VP: [[FINISH_LOOPEXIT]]:
; NO-VP-NEXT: ret void
;
@@ -1345,12 +1820,10 @@ define void @test_fsub(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 100, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = fsub fast <vscale x 4 x float> [[VP_OP_LOAD]], splat (float 3.000000e+00)
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[VP_OP]], ptr align 4 [[TMP18]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[VP_OP]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP19]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
@@ -1377,9 +1850,46 @@ define void @test_fsub(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-LABEL: define void @test_fsub(
; NO-VP-SAME: ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP15]], 4
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP2]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[B1]], [[A2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP8]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP11]], align 4
+; NO-VP-NEXT: [[TMP12:%.*]] = fsub fast <vscale x 4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00)
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 4 x float> [[TMP12]], ptr [[TMP13]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]]
; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
@@ -1387,7 +1897,7 @@ define void @test_fsub(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]]
; NO-VP-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4
; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
-; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP31:![0-9]+]]
; NO-VP: [[FINISH_LOOPEXIT]]:
; NO-VP-NEXT: ret void
;
@@ -1439,12 +1949,10 @@ define void @test_fmul(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 100, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = fmul fast <vscale x 4 x float> [[VP_OP_LOAD]], splat (float 3.000000e+00)
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[VP_OP]], ptr align 4 [[TMP18]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[VP_OP]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP19]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
@@ -1471,9 +1979,46 @@ define void @test_fmul(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-LABEL: define void @test_fmul(
; NO-VP-SAME: ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP15]], 4
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP2]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[B1]], [[A2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP8]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP11]], align 4
+; NO-VP-NEXT: [[TMP12:%.*]] = fmul fast <vscale x 4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00)
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 4 x float> [[TMP12]], ptr [[TMP13]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]]
; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
@@ -1481,7 +2026,7 @@ define void @test_fmul(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]]
; NO-VP-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4
; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
-; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP33:![0-9]+]]
; NO-VP: [[FINISH_LOOPEXIT]]:
; NO-VP-NEXT: ret void
;
@@ -1533,12 +2078,10 @@ define void @test_fdiv(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 100, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = fdiv fast <vscale x 4 x float> [[VP_OP_LOAD]], splat (float 3.000000e+00)
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[VP_OP]], ptr align 4 [[TMP18]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[VP_OP]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP19]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
@@ -1565,9 +2108,46 @@ define void @test_fdiv(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-LABEL: define void @test_fdiv(
; NO-VP-SAME: ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP15]], 4
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP2]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[B1]], [[A2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP8]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP11]], align 4
+; NO-VP-NEXT: [[TMP12:%.*]] = fdiv fast <vscale x 4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00)
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 4 x float> [[TMP12]], ptr [[TMP13]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]]
; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
@@ -1575,7 +2155,7 @@ define void @test_fdiv(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]]
; NO-VP-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4
; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
-; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP35:![0-9]+]]
; NO-VP: [[FINISH_LOOPEXIT]]:
; NO-VP-NEXT: ret void
;
@@ -1680,12 +2260,10 @@ define void @test_fneg(ptr nocapture %a, ptr nocapture readonly %b) {
; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 100, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = fneg fast <vscale x 4 x float> [[VP_OP_LOAD]]
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[VP_OP]], ptr align 4 [[TMP18]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[VP_OP]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP19]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
@@ -1712,9 +2290,46 @@ define void @test_fneg(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-LABEL: define void @test_fneg(
; NO-VP-SAME: ptr captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP15]], 4
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP2]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[B1]], [[A2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP8]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP11]], align 4
+; NO-VP-NEXT: [[TMP12:%.*]] = fneg fast <vscale x 4 x float> [[WIDE_LOAD]]
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 4 x float> [[TMP12]], ptr [[TMP13]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]]
; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
@@ -1722,7 +2337,7 @@ define void @test_fneg(ptr nocapture %a, ptr nocapture readonly %b) {
; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]]
; NO-VP-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4
; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
-; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP37:![0-9]+]]
; NO-VP: [[FINISH_LOOPEXIT]]:
; NO-VP-NEXT: ret void
;
@@ -1784,3 +2399,42 @@ finish.loopexit:
; IF-EVL: [[LOOP37]] = distinct !{[[LOOP37]], [[META1]], [[META2]], [[META3]]}
; IF-EVL: [[LOOP38]] = distinct !{[[LOOP38]], [[META1]]}
;.
+; NO-VP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; NO-VP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; NO-VP: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; NO-VP: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+; NO-VP: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
+; NO-VP: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]}
+; NO-VP: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]}
+; NO-VP: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]}
+; NO-VP: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]}
+; NO-VP: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]]}
+; NO-VP: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]]}
+; NO-VP: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]]}
+; NO-VP: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]]}
+; NO-VP: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]]}
+; NO-VP: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP25]] = distinct !{[[LOOP25]], [[META1]]}
+; NO-VP: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP27]] = distinct !{[[LOOP27]], [[META1]]}
+; NO-VP: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP29]] = distinct !{[[LOOP29]], [[META1]]}
+; NO-VP: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP31]] = distinct !{[[LOOP31]], [[META1]]}
+; NO-VP: [[LOOP32]] = distinct !{[[LOOP32]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP33]] = distinct !{[[LOOP33]], [[META1]]}
+; NO-VP: [[LOOP34]] = distinct !{[[LOOP34]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP35]] = distinct !{[[LOOP35]], [[META1]]}
+; NO-VP: [[LOOP36]] = distinct !{[[LOOP36]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP37]] = distinct !{[[LOOP37]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-call-intrinsics.ll
index 7a5415a..7d21c4d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-call-intrinsics.ll
@@ -1,12 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s --check-prefix=IF-EVL
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=none \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s --check-prefix=NO-VP
define void @vp_smax(ptr %a, ptr %b, ptr %c, i64 %N) {
@@ -44,15 +42,12 @@ define void @vp_smax(ptr %a, ptr %b, ptr %c, i64 %N) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD5:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: [[VP_OP_LOAD5:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP29:%.*]] = call <vscale x 4 x i32> @llvm.smax.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> [[VP_OP_LOAD5]])
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP29]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP29]], ptr align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP9]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
@@ -81,9 +76,53 @@ define void @vp_smax(ptr %a, ptr %b, ptr %c, i64 %N) {
; NO-VP-LABEL: define void @vp_smax(
; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; NO-VP-NEXT: [[ENTRY:.*]]:
+; NO-VP-NEXT: [[C3:%.*]] = ptrtoint ptr [[C]] to i64
+; NO-VP-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 4
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP19]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
+; NO-VP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP8:%.*]] = sub i64 [[A1]], [[C3]]
+; NO-VP-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
+; NO-VP-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
+; NO-VP-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP10]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
+; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; NO-VP-NEXT: [[TMP15:%.*]] = call <vscale x 4 x i32> @llvm.smax.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[WIDE_LOAD5]])
+; NO-VP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP15]], ptr [[TMP16]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
+; NO-VP-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP]], align 4
; NO-VP-NEXT: [[GEP3:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
@@ -93,7 +132,7 @@ define void @vp_smax(ptr %a, ptr %b, ptr %c, i64 %N) {
; NO-VP-NEXT: store i32 [[DOT]], ptr [[GEP11]], align 4
; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -153,15 +192,12 @@ define void @vp_smin(ptr %a, ptr %b, ptr %c, i64 %N) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD5:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: [[VP_OP_LOAD5:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP29:%.*]] = call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> [[VP_OP_LOAD5]])
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP29]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP29]], ptr align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP9]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
@@ -190,9 +226,53 @@ define void @vp_smin(ptr %a, ptr %b, ptr %c, i64 %N) {
; NO-VP-LABEL: define void @vp_smin(
; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[ENTRY:.*]]:
+; NO-VP-NEXT: [[C3:%.*]] = ptrtoint ptr [[C]] to i64
+; NO-VP-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 4
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP19]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
+; NO-VP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP8:%.*]] = sub i64 [[A1]], [[C3]]
+; NO-VP-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
+; NO-VP-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
+; NO-VP-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP10]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
+; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; NO-VP-NEXT: [[TMP15:%.*]] = call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[WIDE_LOAD5]])
+; NO-VP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP15]], ptr [[TMP16]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
+; NO-VP-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP]], align 4
; NO-VP-NEXT: [[GEP3:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
@@ -202,7 +282,7 @@ define void @vp_smin(ptr %a, ptr %b, ptr %c, i64 %N) {
; NO-VP-NEXT: store i32 [[DOT]], ptr [[GEP11]], align 4
; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -262,15 +342,12 @@ define void @vp_umax(ptr %a, ptr %b, ptr %c, i64 %N) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD5:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: [[VP_OP_LOAD5:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP29:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> [[VP_OP_LOAD5]])
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP29]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP29]], ptr align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP9]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
@@ -299,9 +376,53 @@ define void @vp_umax(ptr %a, ptr %b, ptr %c, i64 %N) {
; NO-VP-LABEL: define void @vp_umax(
; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[ENTRY:.*]]:
+; NO-VP-NEXT: [[C3:%.*]] = ptrtoint ptr [[C]] to i64
+; NO-VP-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 4
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP19]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
+; NO-VP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP8:%.*]] = sub i64 [[A1]], [[C3]]
+; NO-VP-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
+; NO-VP-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
+; NO-VP-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP10]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
+; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; NO-VP-NEXT: [[TMP15:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[WIDE_LOAD5]])
+; NO-VP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP15]], ptr [[TMP16]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
+; NO-VP-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP]], align 4
; NO-VP-NEXT: [[GEP3:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
@@ -311,7 +432,7 @@ define void @vp_umax(ptr %a, ptr %b, ptr %c, i64 %N) {
; NO-VP-NEXT: store i32 [[DOT]], ptr [[GEP11]], align 4
; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -371,15 +492,12 @@ define void @vp_umin(ptr %a, ptr %b, ptr %c, i64 %N) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD5:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: [[VP_OP_LOAD5:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP29:%.*]] = call <vscale x 4 x i32> @llvm.umin.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> [[VP_OP_LOAD5]])
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP29]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP29]], ptr align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP9]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
@@ -408,9 +526,53 @@ define void @vp_umin(ptr %a, ptr %b, ptr %c, i64 %N) {
; NO-VP-LABEL: define void @vp_umin(
; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[ENTRY:.*]]:
+; NO-VP-NEXT: [[C3:%.*]] = ptrtoint ptr [[C]] to i64
+; NO-VP-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 4
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP19]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
+; NO-VP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP8:%.*]] = sub i64 [[A1]], [[C3]]
+; NO-VP-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]]
+; NO-VP-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
+; NO-VP-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP10]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
+; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; NO-VP-NEXT: [[TMP15:%.*]] = call <vscale x 4 x i32> @llvm.umin.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[WIDE_LOAD5]])
+; NO-VP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP15]], ptr [[TMP16]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
+; NO-VP-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP]], align 4
; NO-VP-NEXT: [[GEP3:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
@@ -420,7 +582,7 @@ define void @vp_umin(ptr %a, ptr %b, ptr %c, i64 %N) {
; NO-VP-NEXT: store i32 [[DOT]], ptr [[GEP11]], align 4
; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -476,12 +638,10 @@ define void @vp_ctlz(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP24:%.*]] = call <vscale x 4 x i32> @llvm.ctlz.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], i1 true)
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP24]], ptr align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP24]], ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
@@ -508,17 +668,54 @@ define void @vp_ctlz(ptr %a, ptr %b, i64 %N) {
; NO-VP-LABEL: define void @vp_ctlz(
; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[ENTRY:.*]]:
+; NO-VP-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP11]], 4
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP15]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP8]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[GEP]], align 4
+; NO-VP-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.ctlz.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]], i1 true)
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP12]], ptr [[TMP13]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP10]]
+; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
-; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP]], align 4
+; NO-VP-NEXT: [[IV1:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; NO-VP-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP1]], align 4
; NO-VP-NEXT: [[TMP1:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[TMP0]], i1 true)
-; NO-VP-NEXT: [[GEP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[GEP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
; NO-VP-NEXT: store i32 [[TMP1]], ptr [[GEP3]], align 4
-; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -571,12 +768,10 @@ define void @vp_cttz(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP13]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP13]])
; IF-EVL-NEXT: [[TMP17:%.*]] = call <vscale x 4 x i32> @llvm.cttz.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], i1 true)
; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP17]], ptr align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP13]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP17]], ptr align 4 [[TMP18]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP13]])
; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP13]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP12]]
@@ -603,17 +798,54 @@ define void @vp_cttz(ptr %a, ptr %b, i64 %N) {
; NO-VP-LABEL: define void @vp_cttz(
; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[ENTRY:.*]]:
+; NO-VP-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP11]], 4
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP15]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP8]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[GEP]], align 4
+; NO-VP-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.cttz.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]], i1 true)
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP12]], ptr [[TMP13]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP10]]
+; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
-; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP]], align 4
+; NO-VP-NEXT: [[IV1:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; NO-VP-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP1]], align 4
; NO-VP-NEXT: [[TMP1:%.*]] = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 [[TMP0]], i1 true)
-; NO-VP-NEXT: [[GEP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[GEP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
; NO-VP-NEXT: store i32 [[TMP1]], ptr [[GEP3]], align 4
-; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -666,14 +898,12 @@ define void @vp_lrint(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP27:%.*]] = fpext <vscale x 4 x float> [[VP_OP_LOAD]] to <vscale x 4 x double>
; IF-EVL-NEXT: [[TMP28:%.*]] = call <vscale x 4 x i64> @llvm.lrint.nxv4i64.nxv4f64(<vscale x 4 x double> [[TMP27]])
; IF-EVL-NEXT: [[TMP15:%.*]] = trunc <vscale x 4 x i64> [[TMP28]] to <vscale x 4 x i32>
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP15]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP15]], ptr align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP9]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
@@ -702,19 +932,58 @@ define void @vp_lrint(ptr %a, ptr %b, i64 %N) {
; NO-VP-LABEL: define void @vp_lrint(
; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[ENTRY:.*]]:
+; NO-VP-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP11]], 4
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP17]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP8]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[GEP]], align 4
+; NO-VP-NEXT: [[TMP12:%.*]] = fpext <vscale x 4 x float> [[WIDE_LOAD]] to <vscale x 4 x double>
+; NO-VP-NEXT: [[TMP13:%.*]] = call <vscale x 4 x i64> @llvm.lrint.nxv4i64.nxv4f64(<vscale x 4 x double> [[TMP12]])
+; NO-VP-NEXT: [[TMP14:%.*]] = trunc <vscale x 4 x i64> [[TMP13]] to <vscale x 4 x i32>
+; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP14]], ptr [[TMP15]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP10]]
+; NO-VP-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
-; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
-; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[GEP]], align 4
+; NO-VP-NEXT: [[IV1:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; NO-VP-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV1]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[GEP1]], align 4
; NO-VP-NEXT: [[CONV2:%.*]] = fpext float [[TMP0]] to double
; NO-VP-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.lrint.i64.f64(double [[CONV2]])
; NO-VP-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP1]] to i32
-; NO-VP-NEXT: [[GEP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[GEP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
; NO-VP-NEXT: store i32 [[CONV3]], ptr [[GEP5]], align 4
-; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -769,14 +1038,12 @@ define void @vp_llrint(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP27:%.*]] = fpext <vscale x 4 x float> [[VP_OP_LOAD]] to <vscale x 4 x double>
; IF-EVL-NEXT: [[TMP28:%.*]] = call <vscale x 4 x i64> @llvm.llrint.nxv4i64.nxv4f64(<vscale x 4 x double> [[TMP27]])
; IF-EVL-NEXT: [[TMP15:%.*]] = trunc <vscale x 4 x i64> [[TMP28]] to <vscale x 4 x i32>
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP15]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP15]], ptr align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP9]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
@@ -805,19 +1072,58 @@ define void @vp_llrint(ptr %a, ptr %b, i64 %N) {
; NO-VP-LABEL: define void @vp_llrint(
; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[ENTRY:.*]]:
+; NO-VP-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP11]], 4
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP17]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP8]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[GEP]], align 4
+; NO-VP-NEXT: [[TMP12:%.*]] = fpext <vscale x 4 x float> [[WIDE_LOAD]] to <vscale x 4 x double>
+; NO-VP-NEXT: [[TMP13:%.*]] = call <vscale x 4 x i64> @llvm.llrint.nxv4i64.nxv4f64(<vscale x 4 x double> [[TMP12]])
+; NO-VP-NEXT: [[TMP14:%.*]] = trunc <vscale x 4 x i64> [[TMP13]] to <vscale x 4 x i32>
+; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP14]], ptr [[TMP15]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP10]]
+; NO-VP-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
-; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
-; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[GEP]], align 4
+; NO-VP-NEXT: [[IV1:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; NO-VP-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV1]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[GEP1]], align 4
; NO-VP-NEXT: [[CONV2:%.*]] = fpext float [[TMP0]] to double
; NO-VP-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.llrint.i64.f64(double [[CONV2]])
; NO-VP-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP1]] to i32
-; NO-VP-NEXT: [[GEP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[GEP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
; NO-VP-NEXT: store i32 [[CONV3]], ptr [[GEP5]], align 4
-; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -872,12 +1178,10 @@ define void @vp_abs(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP24:%.*]] = call <vscale x 4 x i32> @llvm.abs.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], i1 true)
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP24]], ptr align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP24]], ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
@@ -904,17 +1208,54 @@ define void @vp_abs(ptr %a, ptr %b, i64 %N) {
; NO-VP-LABEL: define void @vp_abs(
; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[ENTRY:.*]]:
+; NO-VP-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP11]], 4
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP8]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[GEP]], align 4
+; NO-VP-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.abs.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]], i1 true)
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP12]], ptr [[TMP13]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP10]]
+; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
-; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP]], align 4
+; NO-VP-NEXT: [[IV1:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; NO-VP-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP1]], align 4
; NO-VP-NEXT: [[COND:%.*]] = tail call i32 @llvm.abs.i32(i32 [[TMP0]], i1 true)
-; NO-VP-NEXT: [[GEP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[GEP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
; NO-VP-NEXT: store i32 [[COND]], ptr [[GEP9]], align 4
-; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -959,17 +1300,44 @@ define void @log10(ptr %a, ptr %b, i64 %N) {
; NO-VP-LABEL: define void @log10(
; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[ENTRY:.*]]:
+; NO-VP-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP]], align 4
+; NO-VP-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.log10.v4f32(<4 x float> [[WIDE_LOAD]])
+; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store <4 x float> [[TMP2]], ptr [[TMP3]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; NO-VP-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
-; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
-; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[GEP]], align 4
+; NO-VP-NEXT: [[IV1:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; NO-VP-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV1]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[GEP1]], align 4
; NO-VP-NEXT: [[COND:%.*]] = tail call float @llvm.log10.f32(float [[TMP0]])
-; NO-VP-NEXT: [[GEP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[GEP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]]
; NO-VP-NEXT: store float [[COND]], ptr [[GEP9]], align 4
-; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP21:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -1025,3 +1393,26 @@ declare i32 @llvm.abs.i32(i32, i1 immarg)
; IF-EVL: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]], [[META3]]}
; IF-EVL: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]]}
;.
+; NO-VP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; NO-VP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; NO-VP: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; NO-VP: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+; NO-VP: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
+; NO-VP: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]}
+; NO-VP: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]}
+; NO-VP: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]}
+; NO-VP: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]}
+; NO-VP: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]]}
+; NO-VP: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]]}
+; NO-VP: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]]}
+; NO-VP: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll
index 091eb87..4b718dd 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll
@@ -1,12 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s --check-prefix=IF-EVL
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=none \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s --check-prefix=NO-VP
define void @vp_sext(ptr %a, ptr %b, i64 %N) {
@@ -39,12 +37,10 @@ define void @vp_sext(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr align 4 [[TMP15]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]]), !alias.scope [[META0:![0-9]+]]
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr align 4 [[TMP14]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]]), !alias.scope [[META0:![0-9]+]]
; IF-EVL-NEXT: [[TMP16:%.*]] = sext <vscale x 2 x i32> [[VP_OP_LOAD]] to <vscale x 2 x i64>
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP17]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP16]], ptr align 8 [[TMP18]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP16]], ptr align 8 [[TMP17]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP11]]
@@ -71,17 +67,54 @@ define void @vp_sext(ptr %a, ptr %b, i64 %N) {
; NO-VP-LABEL: define void @vp_sext(
; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; NO-VP-NEXT: [[ENTRY:.*]]:
+; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP9]], 2
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 20, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP3:%.*]] = shl i64 [[N]], 3
+; NO-VP-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]]
+; NO-VP-NEXT: [[TMP4:%.*]] = shl i64 [[N]], 2
+; NO-VP-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
+; NO-VP-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
+; NO-VP-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
+; NO-VP-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; NO-VP-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP6]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[GEP]], align 4, !alias.scope [[META0:![0-9]+]]
+; NO-VP-NEXT: [[TMP10:%.*]] = sext <vscale x 2 x i32> [[WIDE_LOAD]] to <vscale x 2 x i64>
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store <vscale x 2 x i64> [[TMP10]], ptr [[TMP11]], align 8, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP8]]
+; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
-; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP]], align 4
+; NO-VP-NEXT: [[IV1:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; NO-VP-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP1]], align 4
; NO-VP-NEXT: [[CONV2:%.*]] = sext i32 [[TMP0]] to i64
-; NO-VP-NEXT: [[GEP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[GEP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]]
; NO-VP-NEXT: store i64 [[CONV2]], ptr [[GEP4]], align 8
-; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -133,12 +166,10 @@ define void @vp_zext(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr align 4 [[TMP15]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]]), !alias.scope [[META10:![0-9]+]]
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr align 4 [[TMP14]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]]), !alias.scope [[META10:![0-9]+]]
; IF-EVL-NEXT: [[TMP16:%.*]] = zext <vscale x 2 x i32> [[VP_OP_LOAD]] to <vscale x 2 x i64>
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP17]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP16]], ptr align 8 [[TMP18]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]]), !alias.scope [[META13:![0-9]+]], !noalias [[META10]]
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP16]], ptr align 8 [[TMP17]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]]), !alias.scope [[META13:![0-9]+]], !noalias [[META10]]
; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP11]]
@@ -165,17 +196,54 @@ define void @vp_zext(ptr %a, ptr %b, i64 %N) {
; NO-VP-LABEL: define void @vp_zext(
; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[ENTRY:.*]]:
+; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP9]], 2
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 20, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP3:%.*]] = shl i64 [[N]], 3
+; NO-VP-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]]
+; NO-VP-NEXT: [[TMP4:%.*]] = shl i64 [[N]], 2
+; NO-VP-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
+; NO-VP-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
+; NO-VP-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
+; NO-VP-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; NO-VP-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP6]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[GEP]], align 4, !alias.scope [[META9:![0-9]+]]
+; NO-VP-NEXT: [[TMP10:%.*]] = zext <vscale x 2 x i32> [[WIDE_LOAD]] to <vscale x 2 x i64>
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store <vscale x 2 x i64> [[TMP10]], ptr [[TMP11]], align 8, !alias.scope [[META12:![0-9]+]], !noalias [[META9]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP8]]
+; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP]], align 4
+; NO-VP-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NO-VP-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP1]], align 4
; NO-VP-NEXT: [[CONV:%.*]] = zext i32 [[TMP0]] to i64
-; NO-VP-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]]
; NO-VP-NEXT: store i64 [[CONV]], ptr [[GEP2]], align 8
-; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -227,12 +295,10 @@ define void @vp_trunc(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP15]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]]), !alias.scope [[META17:![0-9]+]]
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP14]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]]), !alias.scope [[META17:![0-9]+]]
; IF-EVL-NEXT: [[TMP16:%.*]] = trunc <vscale x 2 x i64> [[VP_OP_LOAD]] to <vscale x 2 x i32>
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i32.p0(<vscale x 2 x i32> [[TMP16]], ptr align 4 [[TMP18]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]]), !alias.scope [[META20:![0-9]+]], !noalias [[META17]]
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i32.p0(<vscale x 2 x i32> [[TMP16]], ptr align 4 [[TMP17]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]]), !alias.scope [[META20:![0-9]+]], !noalias [[META17]]
; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP11]]
@@ -259,17 +325,54 @@ define void @vp_trunc(ptr %a, ptr %b, i64 %N) {
; NO-VP-LABEL: define void @vp_trunc(
; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[ENTRY:.*]]:
+; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP9]], 2
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 20, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP3:%.*]] = shl i64 [[N]], 2
+; NO-VP-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]]
+; NO-VP-NEXT: [[TMP4:%.*]] = shl i64 [[N]], 3
+; NO-VP-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
+; NO-VP-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
+; NO-VP-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
+; NO-VP-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; NO-VP-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP6]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[GEP]], align 8, !alias.scope [[META16:![0-9]+]]
+; NO-VP-NEXT: [[TMP10:%.*]] = trunc <vscale x 2 x i64> [[WIDE_LOAD]] to <vscale x 2 x i32>
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store <vscale x 2 x i32> [[TMP10]], ptr [[TMP11]], align 4, !alias.scope [[META19:![0-9]+]], !noalias [[META16]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP8]]
+; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; NO-VP-NEXT: [[TMP0:%.*]] = load i64, ptr [[GEP]], align 8
+; NO-VP-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NO-VP-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV1]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i64, ptr [[GEP1]], align 8
; NO-VP-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
-; NO-VP-NEXT: [[GEP2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[GEP2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
; NO-VP-NEXT: store i32 [[CONV]], ptr [[GEP2]], align 4
-; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP22:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -321,12 +424,10 @@ define void @vp_fpext(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x float> @llvm.vp.load.nxv2f32.p0(ptr align 4 [[TMP15]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]]), !alias.scope [[META24:![0-9]+]]
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x float> @llvm.vp.load.nxv2f32.p0(ptr align 4 [[TMP14]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]]), !alias.scope [[META24:![0-9]+]]
; IF-EVL-NEXT: [[TMP16:%.*]] = fpext <vscale x 2 x float> [[VP_OP_LOAD]] to <vscale x 2 x double>
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, ptr [[TMP17]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv2f64.p0(<vscale x 2 x double> [[TMP16]], ptr align 8 [[TMP18]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]]), !alias.scope [[META27:![0-9]+]], !noalias [[META24]]
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2f64.p0(<vscale x 2 x double> [[TMP16]], ptr align 8 [[TMP17]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]]), !alias.scope [[META27:![0-9]+]], !noalias [[META24]]
; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP11]]
@@ -353,17 +454,54 @@ define void @vp_fpext(ptr %a, ptr %b, i64 %N) {
; NO-VP-LABEL: define void @vp_fpext(
; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[ENTRY:.*]]:
+; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP9]], 2
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP3:%.*]] = shl i64 [[N]], 3
+; NO-VP-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]]
+; NO-VP-NEXT: [[TMP4:%.*]] = shl i64 [[N]], 2
+; NO-VP-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
+; NO-VP-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
+; NO-VP-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
+; NO-VP-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; NO-VP-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP6]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x float>, ptr [[GEP]], align 4, !alias.scope [[META23:![0-9]+]]
+; NO-VP-NEXT: [[TMP10:%.*]] = fpext <vscale x 2 x float> [[WIDE_LOAD]] to <vscale x 2 x double>
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store <vscale x 2 x double> [[TMP10]], ptr [[TMP11]], align 8, !alias.scope [[META26:![0-9]+]], !noalias [[META23]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP8]]
+; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
-; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[GEP]], align 4
+; NO-VP-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NO-VP-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV1]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[GEP1]], align 4
; NO-VP-NEXT: [[CONV:%.*]] = fpext float [[TMP0]] to double
-; NO-VP-NEXT: [[GEP2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[GEP2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[IV1]]
; NO-VP-NEXT: store double [[CONV]], ptr [[GEP2]], align 8
-; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP29:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -415,12 +553,10 @@ define void @vp_fptrunc(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[TMP14]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x double> @llvm.vp.load.nxv2f64.p0(ptr align 8 [[TMP15]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]]), !alias.scope [[META31:![0-9]+]]
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x double> @llvm.vp.load.nxv2f64.p0(ptr align 8 [[TMP14]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]]), !alias.scope [[META31:![0-9]+]]
; IF-EVL-NEXT: [[TMP16:%.*]] = fptrunc <vscale x 2 x double> [[VP_OP_LOAD]] to <vscale x 2 x float>
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv2f32.p0(<vscale x 2 x float> [[TMP16]], ptr align 4 [[TMP18]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]]), !alias.scope [[META34:![0-9]+]], !noalias [[META31]]
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2f32.p0(<vscale x 2 x float> [[TMP16]], ptr align 4 [[TMP17]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP12]]), !alias.scope [[META34:![0-9]+]], !noalias [[META31]]
; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP11]]
@@ -447,17 +583,54 @@ define void @vp_fptrunc(ptr %a, ptr %b, i64 %N) {
; NO-VP-LABEL: define void @vp_fptrunc(
; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[ENTRY:.*]]:
+; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP9]], 2
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP3:%.*]] = shl i64 [[N]], 2
+; NO-VP-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]]
+; NO-VP-NEXT: [[TMP4:%.*]] = shl i64 [[N]], 3
+; NO-VP-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP4]]
+; NO-VP-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
+; NO-VP-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
+; NO-VP-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; NO-VP-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 2
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP6]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[GEP]], align 8, !alias.scope [[META30:![0-9]+]]
+; NO-VP-NEXT: [[TMP10:%.*]] = fptrunc <vscale x 2 x double> [[WIDE_LOAD]] to <vscale x 2 x float>
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store <vscale x 2 x float> [[TMP10]], ptr [[TMP11]], align 4, !alias.scope [[META33:![0-9]+]], !noalias [[META30]]
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP8]]
+; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[IV]]
-; NO-VP-NEXT: [[TMP0:%.*]] = load double, ptr [[GEP]], align 8
+; NO-VP-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NO-VP-NEXT: [[GEP1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[IV1]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load double, ptr [[GEP1]], align 8
; NO-VP-NEXT: [[CONV:%.*]] = fptrunc double [[TMP0]] to float
-; NO-VP-NEXT: [[GEP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[GEP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]]
; NO-VP-NEXT: store float [[CONV]], ptr [[GEP2]], align 4
-; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP36:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -509,12 +682,10 @@ define void @vp_sitofp(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
; IF-EVL-NEXT: [[TMP18:%.*]] = sitofp <vscale x 4 x i32> [[VP_OP_LOAD]] to <vscale x 4 x float>
; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[TMP18]], ptr align 4 [[TMP20]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[TMP18]], ptr align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP14]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP13]]
@@ -541,17 +712,54 @@ define void @vp_sitofp(ptr %a, ptr %b, i64 %N) {
; NO-VP-LABEL: define void @vp_sitofp(
; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[ENTRY:.*]]:
+; NO-VP-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP11]], 4
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP8]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[GEP]], align 4
+; NO-VP-NEXT: [[TMP12:%.*]] = sitofp <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x float>
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store <vscale x 4 x float> [[TMP12]], ptr [[TMP13]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP10]]
+; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP]], align 4
+; NO-VP-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NO-VP-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP1]], align 4
; NO-VP-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
-; NO-VP-NEXT: [[GEP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[GEP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]]
; NO-VP-NEXT: store float [[CONV]], ptr [[GEP2]], align 4
-; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP38:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -603,12 +811,10 @@ define void @vp_uitofp(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
; IF-EVL-NEXT: [[TMP18:%.*]] = uitofp <vscale x 4 x i32> [[VP_OP_LOAD]] to <vscale x 4 x float>
; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[TMP18]], ptr align 4 [[TMP20]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[TMP18]], ptr align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP14]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP13]]
@@ -635,17 +841,54 @@ define void @vp_uitofp(ptr %a, ptr %b, i64 %N) {
; NO-VP-LABEL: define void @vp_uitofp(
; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[ENTRY:.*]]:
+; NO-VP-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP11]], 4
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP8]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[GEP]], align 4
+; NO-VP-NEXT: [[TMP12:%.*]] = uitofp <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x float>
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store <vscale x 4 x float> [[TMP12]], ptr [[TMP13]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP10]]
+; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP]], align 4
+; NO-VP-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NO-VP-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP1]], align 4
; NO-VP-NEXT: [[CONV:%.*]] = uitofp i32 [[TMP0]] to float
-; NO-VP-NEXT: [[GEP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[GEP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]]
; NO-VP-NEXT: store float [[CONV]], ptr [[GEP2]], align 4
-; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP40:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -697,12 +940,10 @@ define void @vp_fptosi(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
; IF-EVL-NEXT: [[TMP18:%.*]] = fptosi <vscale x 4 x float> [[VP_OP_LOAD]] to <vscale x 4 x i32>
; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP20]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP14]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP13]]
@@ -729,17 +970,54 @@ define void @vp_fptosi(ptr %a, ptr %b, i64 %N) {
; NO-VP-LABEL: define void @vp_fptosi(
; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[ENTRY:.*]]:
+; NO-VP-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP11]], 4
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP8]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[GEP]], align 4
+; NO-VP-NEXT: [[TMP12:%.*]] = fptosi <vscale x 4 x float> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP12]], ptr [[TMP13]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP10]]
+; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
-; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[GEP]], align 4
+; NO-VP-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NO-VP-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV1]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[GEP1]], align 4
; NO-VP-NEXT: [[CONV:%.*]] = fptosi float [[TMP0]] to i32
-; NO-VP-NEXT: [[GEP2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[GEP2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
; NO-VP-NEXT: store i32 [[CONV]], ptr [[GEP2]], align 4
-; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP42:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -791,12 +1069,10 @@ define void @vp_fptoui(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
; IF-EVL-NEXT: [[TMP18:%.*]] = fptoui <vscale x 4 x float> [[VP_OP_LOAD]] to <vscale x 4 x i32>
; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP20]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP14]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP13]]
@@ -823,17 +1099,54 @@ define void @vp_fptoui(ptr %a, ptr %b, i64 %N) {
; NO-VP-LABEL: define void @vp_fptoui(
; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[ENTRY:.*]]:
+; NO-VP-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP11]], 4
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP8]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 4
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[GEP]], align 4
+; NO-VP-NEXT: [[TMP12:%.*]] = fptoui <vscale x 4 x float> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP12]], ptr [[TMP13]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP10]]
+; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
-; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[GEP]], align 4
+; NO-VP-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NO-VP-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV1]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[GEP1]], align 4
; NO-VP-NEXT: [[CONV:%.*]] = fptoui float [[TMP0]] to i32
-; NO-VP-NEXT: [[GEP2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[GEP2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
; NO-VP-NEXT: store i32 [[CONV]], ptr [[GEP2]], align 4
-; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP44:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -885,12 +1198,10 @@ define void @vp_inttoptr(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP17]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP14]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP16]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP14]])
; IF-EVL-NEXT: [[TMP18:%.*]] = inttoptr <vscale x 2 x i64> [[VP_OP_LOAD]] to <vscale x 2 x ptr>
; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds ptr, ptr [[TMP19]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[TMP18]], ptr align 8 [[TMP20]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP14]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[TMP18]], ptr align 8 [[TMP19]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP14]])
; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP14]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP13]]
@@ -917,17 +1228,54 @@ define void @vp_inttoptr(ptr %a, ptr %b, i64 %N) {
; NO-VP-LABEL: define void @vp_inttoptr(
; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[ENTRY:.*]]:
+; NO-VP-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; NO-VP-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64
+; NO-VP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP11]], 2
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP15]])
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; NO-VP: [[VECTOR_MEMCHECK]]:
+; NO-VP-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
+; NO-VP-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
+; NO-VP-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP8]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 2
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[GEP]], align 8
+; NO-VP-NEXT: [[TMP12:%.*]] = inttoptr <vscale x 2 x i64> [[WIDE_LOAD]] to <vscale x 2 x ptr>
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store <vscale x 2 x ptr> [[TMP12]], ptr [[TMP13]], align 8
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP10]]
+; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; NO-VP-NEXT: [[TMP0:%.*]] = load i64, ptr [[GEP]], align 8
+; NO-VP-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NO-VP-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV1]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i64, ptr [[GEP1]], align 8
; NO-VP-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr
-; NO-VP-NEXT: [[GEP2:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: [[GEP2:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[IV1]]
; NO-VP-NEXT: store ptr [[TMP1]], ptr [[GEP2]], align 8
-; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP46:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -980,8 +1328,7 @@ define void @vp_ptrtoint(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B]], <vscale x 2 x i64> [[VEC_IND]]
; IF-EVL-NEXT: [[TMP15:%.*]] = ptrtoint <vscale x 2 x ptr> [[TMP14]] to <vscale x 2 x i64>
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP15]], ptr align 8 [[TMP17]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP15]], ptr align 8 [[TMP16]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
@@ -1008,16 +1355,50 @@ define void @vp_ptrtoint(ptr %a, ptr %b, i64 %N) {
; NO-VP-LABEL: define void @vp_ptrtoint(
; NO-VP-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[ENTRY:.*]]:
+; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP13]], 2
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
+; NO-VP-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; NO-VP-NEXT: [[TMP7:%.*]] = mul <vscale x 2 x i64> [[TMP6]], splat (i64 1)
+; NO-VP-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP7]]
+; NO-VP-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP5]]
+; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP8]], i64 0
+; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B]], <vscale x 2 x i64> [[VEC_IND]]
+; NO-VP-NEXT: [[TMP10:%.*]] = ptrtoint <vscale x 2 x ptr> [[TMP9]] to <vscale x 2 x i64>
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 2 x i64> [[TMP10]], ptr [[TMP11]], align 8
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP47:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; NO-VP-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
; NO-VP-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[GEP]] to i64
; NO-VP-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; NO-VP-NEXT: store i64 [[TMP0]], ptr [[GEP2]], align 8
; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP48:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -1089,3 +1470,53 @@ exit:
; IF-EVL: [[LOOP48]] = distinct !{[[LOOP48]], [[META6]], [[META7]], [[META8]]}
; IF-EVL: [[LOOP49]] = distinct !{[[LOOP49]], [[META8]], [[META6]]}
;.
+; NO-VP: [[META0]] = !{[[META1:![0-9]+]]}
+; NO-VP: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
+; NO-VP: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
+; NO-VP: [[META3]] = !{[[META4:![0-9]+]]}
+; NO-VP: [[META4]] = distinct !{[[META4]], [[META2]]}
+; NO-VP: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]}
+; NO-VP: [[META6]] = !{!"llvm.loop.isvectorized", i32 1}
+; NO-VP: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"}
+; NO-VP: [[LOOP8]] = distinct !{[[LOOP8]], [[META6]]}
+; NO-VP: [[META9]] = !{[[META10:![0-9]+]]}
+; NO-VP: [[META10]] = distinct !{[[META10]], [[META11:![0-9]+]]}
+; NO-VP: [[META11]] = distinct !{[[META11]], !"LVerDomain"}
+; NO-VP: [[META12]] = !{[[META13:![0-9]+]]}
+; NO-VP: [[META13]] = distinct !{[[META13]], [[META11]]}
+; NO-VP: [[LOOP14]] = distinct !{[[LOOP14]], [[META6]], [[META7]]}
+; NO-VP: [[LOOP15]] = distinct !{[[LOOP15]], [[META6]]}
+; NO-VP: [[META16]] = !{[[META17:![0-9]+]]}
+; NO-VP: [[META17]] = distinct !{[[META17]], [[META18:![0-9]+]]}
+; NO-VP: [[META18]] = distinct !{[[META18]], !"LVerDomain"}
+; NO-VP: [[META19]] = !{[[META20:![0-9]+]]}
+; NO-VP: [[META20]] = distinct !{[[META20]], [[META18]]}
+; NO-VP: [[LOOP21]] = distinct !{[[LOOP21]], [[META6]], [[META7]]}
+; NO-VP: [[LOOP22]] = distinct !{[[LOOP22]], [[META6]]}
+; NO-VP: [[META23]] = !{[[META24:![0-9]+]]}
+; NO-VP: [[META24]] = distinct !{[[META24]], [[META25:![0-9]+]]}
+; NO-VP: [[META25]] = distinct !{[[META25]], !"LVerDomain"}
+; NO-VP: [[META26]] = !{[[META27:![0-9]+]]}
+; NO-VP: [[META27]] = distinct !{[[META27]], [[META25]]}
+; NO-VP: [[LOOP28]] = distinct !{[[LOOP28]], [[META6]], [[META7]]}
+; NO-VP: [[LOOP29]] = distinct !{[[LOOP29]], [[META6]]}
+; NO-VP: [[META30]] = !{[[META31:![0-9]+]]}
+; NO-VP: [[META31]] = distinct !{[[META31]], [[META32:![0-9]+]]}
+; NO-VP: [[META32]] = distinct !{[[META32]], !"LVerDomain"}
+; NO-VP: [[META33]] = !{[[META34:![0-9]+]]}
+; NO-VP: [[META34]] = distinct !{[[META34]], [[META32]]}
+; NO-VP: [[LOOP35]] = distinct !{[[LOOP35]], [[META6]], [[META7]]}
+; NO-VP: [[LOOP36]] = distinct !{[[LOOP36]], [[META6]]}
+; NO-VP: [[LOOP37]] = distinct !{[[LOOP37]], [[META6]], [[META7]]}
+; NO-VP: [[LOOP38]] = distinct !{[[LOOP38]], [[META6]]}
+; NO-VP: [[LOOP39]] = distinct !{[[LOOP39]], [[META6]], [[META7]]}
+; NO-VP: [[LOOP40]] = distinct !{[[LOOP40]], [[META6]]}
+; NO-VP: [[LOOP41]] = distinct !{[[LOOP41]], [[META6]], [[META7]]}
+; NO-VP: [[LOOP42]] = distinct !{[[LOOP42]], [[META6]]}
+; NO-VP: [[LOOP43]] = distinct !{[[LOOP43]], [[META6]], [[META7]]}
+; NO-VP: [[LOOP44]] = distinct !{[[LOOP44]], [[META6]]}
+; NO-VP: [[LOOP45]] = distinct !{[[LOOP45]], [[META6]], [[META7]]}
+; NO-VP: [[LOOP46]] = distinct !{[[LOOP46]], [[META6]]}
+; NO-VP: [[LOOP47]] = distinct !{[[LOOP47]], [[META6]], [[META7]]}
+; NO-VP: [[LOOP48]] = distinct !{[[LOOP48]], [[META7]], [[META6]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll
index 2926011..5f53cb6 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll
@@ -1,24 +1,20 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefixes=IF-EVL-OUTLOOP
; RUN: opt -passes=loop-vectorize \
; RUN: -prefer-inloop-reductions \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefixes=IF-EVL-INLOOP
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=none \
-; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefixes=NO-VP-OUTLOOP
; RUN: opt -passes=loop-vectorize \
; RUN: -prefer-inloop-reductions \
-; RUN: -force-tail-folding-style=none \
-; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefixes=NO-VP-INLOOP
define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
@@ -44,8 +40,7 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[EVL_BASED_IV1]]
; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 4, i1 true)
; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV1]]
-; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
-; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 3)
; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = select <vscale x 4 x i1> [[TMP18]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer
; IF-EVL-OUTLOOP-NEXT: [[VP_OP:%.*]] = add <vscale x 4 x i32> [[TMP19]], [[VEC_PHI]]
@@ -98,8 +93,7 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
; IF-EVL-INLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
-; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 3)
; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer
; IF-EVL-INLOOP-NEXT: [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, <vscale x 4 x i32> [[TMP20]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
@@ -150,8 +144,7 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP11]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
; NO-VP-OUTLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; NO-VP-OUTLOOP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
; NO-VP-OUTLOOP-NEXT: [[TMP21:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 3)
; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[TMP21]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> zeroinitializer
; NO-VP-OUTLOOP-NEXT: [[TMP17]] = add <vscale x 4 x i32> [[TMP16]], [[VEC_PHI]]
@@ -200,8 +193,7 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
; NO-VP-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
; NO-VP-INLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; NO-VP-INLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; NO-VP-INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
; NO-VP-INLOOP-NEXT: [[TMP9:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], splat (i32 3)
; NO-VP-INLOOP-NEXT: [[TMP10:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> zeroinitializer
; NO-VP-INLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP10]])
@@ -282,8 +274,7 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP14]]
; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
-; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp sle <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 3)
; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[VP_OP_LOAD]]
; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[TMP18]], <vscale x 4 x i1> zeroinitializer
@@ -341,8 +332,7 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
; IF-EVL-INLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
-; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 3)
; IF-EVL-INLOOP-NEXT: [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> [[TMP19]], i32 [[TMP12]])
; IF-EVL-INLOOP-NEXT: [[TMP22]] = add i32 [[TMP21]], [[VEC_PHI]]
@@ -396,8 +386,7 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP11]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
; NO-VP-OUTLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; NO-VP-OUTLOOP-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP21]], align 4
+; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
; NO-VP-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 3)
; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select <vscale x 4 x i1> [[TMP18]], <vscale x 4 x i32> [[TMP16]], <vscale x 4 x i32> [[VEC_PHI]]
@@ -450,8 +439,7 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) {
; NO-VP-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
; NO-VP-INLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; NO-VP-INLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; NO-VP-INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
; NO-VP-INLOOP-NEXT: [[TMP9:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], splat (i32 3)
; NO-VP-INLOOP-NEXT: [[TMP10:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> zeroinitializer
; NO-VP-INLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP10]])
@@ -540,8 +528,7 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP13]], i64 0
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV1]]
-; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
-; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], [[VEC_IND]]
; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer
; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]]
@@ -603,8 +590,7 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP12]], i64 0
; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV1]]
-; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], [[VEC_IND]]
; IF-EVL-INLOOP-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer
; IF-EVL-INLOOP-NEXT: [[TMP17:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, <vscale x 4 x i32> [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
@@ -665,8 +651,7 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP11]], [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
; NO-VP-OUTLOOP-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-OUTLOOP-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; NO-VP-OUTLOOP-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
-; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP20]], align 4
+; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
; NO-VP-OUTLOOP-NEXT: [[TMP27:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_IND]]
; NO-VP-OUTLOOP-NEXT: [[TMP22:%.*]] = select <vscale x 4 x i1> [[TMP27]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> zeroinitializer
; NO-VP-OUTLOOP-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI]]
@@ -725,8 +710,7 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) {
; NO-VP-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
; NO-VP-INLOOP-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-INLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; NO-VP-INLOOP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; NO-VP-INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; NO-VP-INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
; NO-VP-INLOOP-NEXT: [[TMP15:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_IND]]
; NO-VP-INLOOP-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> zeroinitializer
; NO-VP-INLOOP-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP16]])
@@ -820,8 +804,7 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; IF-EVL-OUTLOOP-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i32 0
-; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
+; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[ARRAYIDX]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = icmp sle <vscale x 4 x i32> [[VP_OP_LOAD]], [[VEC_IND2]]
; IF-EVL-OUTLOOP-NEXT: [[TMP22:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[VP_OP_LOAD]]
; IF-EVL-OUTLOOP-NEXT: [[TMP23:%.*]] = select <vscale x 4 x i1> [[TMP18]], <vscale x 4 x i1> [[TMP21]], <vscale x 4 x i1> zeroinitializer
@@ -889,8 +872,7 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP12]], i64 0
; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i32 0
-; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[ARRAYIDX]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], [[VEC_IND]]
; IF-EVL-INLOOP-NEXT: [[TMP16:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> [[TMP15]], i32 [[TMP11]])
; IF-EVL-INLOOP-NEXT: [[TMP17]] = add i32 [[TMP16]], [[VEC_PHI]]
@@ -954,8 +936,7 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP11]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
; NO-VP-OUTLOOP-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-OUTLOOP-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; NO-VP-OUTLOOP-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
-; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP27]], align 4
+; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
; NO-VP-OUTLOOP-NEXT: [[TMP28:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_IND]]
; NO-VP-OUTLOOP-NEXT: [[TMP22:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select <vscale x 4 x i1> [[TMP28]], <vscale x 4 x i32> [[TMP22]], <vscale x 4 x i32> [[VEC_PHI]]
@@ -1018,8 +999,7 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) {
; NO-VP-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
; NO-VP-INLOOP-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-INLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; NO-VP-INLOOP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; NO-VP-INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; NO-VP-INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
; NO-VP-INLOOP-NEXT: [[TMP15:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_IND]]
; NO-VP-INLOOP-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> zeroinitializer
; NO-VP-INLOOP-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP16]])
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cost.ll
index cd53ea0..238aed1 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cost.ll
@@ -3,7 +3,7 @@
; RUN: -mtriple riscv64-linux-gnu -mattr=+v,+f -S -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=DATA
; RUN: opt < %s -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
-; RUN: -mtriple riscv64-linux-gnu -force-tail-folding-style=data-with-evl -mattr=+v,+f -S \
+; RUN: -mtriple riscv64-linux-gnu -mattr=+v,+f -S \
; RUN: -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=EVL
; DATA: Cost of 2 for VF 2: EMIT{{.*}} = active lane mask
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-div.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-div.ll
index 3e83d8a..f27f49e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-div.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-div.ll
@@ -1,12 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s --check-prefix=IF-EVL
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=none \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s --check-prefix=NO-VP
define void @test_sdiv(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
@@ -30,16 +28,13 @@ define void @test_sdiv(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP7]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[VP_OP_LOAD1]], <vscale x 2 x i64> splat (i64 1), i32 [[TMP5]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = sdiv <vscale x 2 x i64> [[VP_OP_LOAD]], [[TMP11]]
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[C]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP]], ptr align 8 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP]], ptr align 8 [[TMP12]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP14]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -68,9 +63,38 @@ define void @test_sdiv(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
; NO-VP-LABEL: define void @test_sdiv(
; NO-VP-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0:[0-9]+]] {
; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 2
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP12]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP13]], 2
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
+; NO-VP-NEXT: [[TMP8:%.*]] = sdiv <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[C]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 2 x i64> [[TMP8]], ptr [[TMP9]], align 8
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[A_GEP:%.*]] = getelementptr i64, ptr [[A]], i64 [[IV]]
; NO-VP-NEXT: [[TMP0:%.*]] = load i64, ptr [[A_GEP]], align 8
; NO-VP-NEXT: [[B_GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[IV]]
@@ -80,7 +104,7 @@ define void @test_sdiv(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
; NO-VP-NEXT: store i64 [[TMP2]], ptr [[C_GEP]], align 8
; NO-VP-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; NO-VP-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; NO-VP-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -126,16 +150,13 @@ define void @test_udiv(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP7]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[VP_OP_LOAD1]], <vscale x 2 x i64> splat (i64 1), i32 [[TMP5]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = udiv <vscale x 2 x i64> [[VP_OP_LOAD]], [[TMP11]]
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[C]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP]], ptr align 8 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP]], ptr align 8 [[TMP12]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP14]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -164,9 +185,38 @@ define void @test_udiv(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
; NO-VP-LABEL: define void @test_udiv(
; NO-VP-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 2
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP12]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP13]], 2
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
+; NO-VP-NEXT: [[TMP8:%.*]] = udiv <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[C]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 2 x i64> [[TMP8]], ptr [[TMP9]], align 8
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[A_GEP:%.*]] = getelementptr i64, ptr [[A]], i64 [[IV]]
; NO-VP-NEXT: [[TMP0:%.*]] = load i64, ptr [[A_GEP]], align 8
; NO-VP-NEXT: [[B_GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[IV]]
@@ -176,7 +226,7 @@ define void @test_udiv(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
; NO-VP-NEXT: store i64 [[TMP2]], ptr [[C_GEP]], align 8
; NO-VP-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; NO-VP-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; NO-VP-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -221,16 +271,13 @@ define void @test_srem(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP7]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[VP_OP_LOAD1]], <vscale x 2 x i64> splat (i64 1), i32 [[TMP5]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = srem <vscale x 2 x i64> [[VP_OP_LOAD]], [[TMP11]]
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[C]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP]], ptr align 8 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP]], ptr align 8 [[TMP12]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP14]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -259,9 +306,38 @@ define void @test_srem(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
; NO-VP-LABEL: define void @test_srem(
; NO-VP-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 2
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP12]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP13]], 2
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
+; NO-VP-NEXT: [[TMP8:%.*]] = srem <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[C]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 2 x i64> [[TMP8]], ptr [[TMP9]], align 8
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[A_GEP:%.*]] = getelementptr i64, ptr [[A]], i64 [[IV]]
; NO-VP-NEXT: [[TMP0:%.*]] = load i64, ptr [[A_GEP]], align 8
; NO-VP-NEXT: [[B_GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[IV]]
@@ -271,7 +347,7 @@ define void @test_srem(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
; NO-VP-NEXT: store i64 [[TMP2]], ptr [[C_GEP]], align 8
; NO-VP-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; NO-VP-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; NO-VP-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -316,16 +392,13 @@ define void @test_urem(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP7]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[VP_OP_LOAD1]], <vscale x 2 x i64> splat (i64 1), i32 [[TMP5]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = urem <vscale x 2 x i64> [[VP_OP_LOAD]], [[TMP11]]
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[C]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP]], ptr align 8 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP]], ptr align 8 [[TMP12]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP14]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -354,9 +427,38 @@ define void @test_urem(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
; NO-VP-LABEL: define void @test_urem(
; NO-VP-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 2
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP12]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP13]], 2
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[A]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP7]], align 8
+; NO-VP-NEXT: [[TMP8:%.*]] = urem <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[C]], i64 [[INDEX]]
+; NO-VP-NEXT: store <vscale x 2 x i64> [[TMP8]], ptr [[TMP9]], align 8
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; NO-VP: [[MIDDLE_BLOCK]]:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-VP: [[SCALAR_PH]]:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
; NO-VP-NEXT: br label %[[LOOP:.*]]
; NO-VP: [[LOOP]]:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; NO-VP-NEXT: [[A_GEP:%.*]] = getelementptr i64, ptr [[A]], i64 [[IV]]
; NO-VP-NEXT: [[TMP0:%.*]] = load i64, ptr [[A_GEP]], align 8
; NO-VP-NEXT: [[B_GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[IV]]
@@ -366,7 +468,7 @@ define void @test_urem(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
; NO-VP-NEXT: store i64 [[TMP2]], ptr [[C_GEP]], align 8
; NO-VP-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; NO-VP-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; NO-VP-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-VP-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
; NO-VP: [[EXIT]]:
; NO-VP-NEXT: ret void
;
@@ -402,3 +504,14 @@ exit:
; IF-EVL: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]], [[META3]]}
; IF-EVL: [[LOOP10]] = distinct !{[[LOOP10]], [[META3]], [[META1]]}
;.
+; NO-VP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; NO-VP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; NO-VP: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; NO-VP: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; NO-VP: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; NO-VP: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; NO-VP: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; NO-VP: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll
index e31d7ff..6a6bca2d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll
@@ -1,14 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes=loop-vectorize \
; RUN: -prefer-inloop-reductions \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=IF-EVL
; RUN: opt -passes=loop-vectorize \
; RUN: -prefer-inloop-reductions \
-; RUN: -force-tail-folding-style=none \
-; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=NO-VP
define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
@@ -39,13 +37,11 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP12]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; IF-EVL-NEXT: [[VP_OP_LOAD]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-NEXT: [[TMP16:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[VP_OP_LOAD]], i32 -1, <vscale x 4 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP12]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = add nsw <vscale x 4 x i32> [[TMP16]], [[VP_OP_LOAD]]
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP]], ptr align 4 [[TMP18]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
@@ -94,13 +90,11 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP10]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD]] = load <vscale x 4 x i32>, ptr [[TMP11]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD]] = load <vscale x 4 x i32>, ptr [[TMP10]], align 4
; NO-VP-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1)
; NO-VP-NEXT: [[TMP13:%.*]] = add nsw <vscale x 4 x i32> [[TMP12]], [[WIDE_LOAD]]
; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 0
-; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP13]], ptr [[TMP15]], align 4
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP13]], ptr [[TMP14]], align 4
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; NO-VP-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NO-VP-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -181,14 +175,12 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP15]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP15]])
+; IF-EVL-NEXT: [[VP_OP_LOAD]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP15]])
; IF-EVL-NEXT: [[TMP19]] = call <vscale x 4 x i32> @llvm.experimental.vp.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[VP_OP_LOAD]], i32 -1, <vscale x 4 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP15]])
; IF-EVL-NEXT: [[TMP20:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR2]], <vscale x 4 x i32> [[TMP19]], i32 -1, <vscale x 4 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP15]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = add nsw <vscale x 4 x i32> [[TMP19]], [[TMP20]]
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP21]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP]], ptr align 4 [[TMP22]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP15]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP]], ptr align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP15]])
; IF-EVL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP15]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
@@ -244,14 +236,12 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; NO-VP-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VECTOR_RECUR2:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP13]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
; NO-VP-NEXT: [[TMP15]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1)
; NO-VP-NEXT: [[TMP16:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR2]], <vscale x 4 x i32> [[TMP15]], i32 -1)
; NO-VP-NEXT: [[TMP17:%.*]] = add nsw <vscale x 4 x i32> [[TMP15]], [[TMP16]]
; NO-VP-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP18]], i32 0
-; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP17]], ptr [[TMP19]], align 4
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP17]], ptr [[TMP18]], align 4
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; NO-VP-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NO-VP-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -344,16 +334,14 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP18]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP20]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP18]])
+; IF-EVL-NEXT: [[VP_OP_LOAD]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP18]])
; IF-EVL-NEXT: [[TMP22]] = call <vscale x 4 x i32> @llvm.experimental.vp.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[VP_OP_LOAD]], i32 -1, <vscale x 4 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP18]])
; IF-EVL-NEXT: [[TMP23]] = call <vscale x 4 x i32> @llvm.experimental.vp.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR2]], <vscale x 4 x i32> [[TMP22]], i32 -1, <vscale x 4 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP18]])
; IF-EVL-NEXT: [[TMP24:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR4]], <vscale x 4 x i32> [[TMP23]], i32 -1, <vscale x 4 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP18]])
; IF-EVL-NEXT: [[TMP40:%.*]] = add nsw <vscale x 4 x i32> [[TMP23]], [[TMP24]]
; IF-EVL-NEXT: [[VP_OP5:%.*]] = add <vscale x 4 x i32> [[TMP40]], [[TMP22]]
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP25]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP5]], ptr align 4 [[TMP26]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP18]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP5]], ptr align 4 [[TMP25]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP18]])
; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP18]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP27]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
@@ -417,16 +405,14 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; NO-VP-NEXT: [[VECTOR_RECUR2:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VECTOR_RECUR4:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT3]], %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP16]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD]] = load <vscale x 4 x i32>, ptr [[TMP17]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD]] = load <vscale x 4 x i32>, ptr [[TMP16]], align 4
; NO-VP-NEXT: [[TMP18]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1)
; NO-VP-NEXT: [[TMP19]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR2]], <vscale x 4 x i32> [[TMP18]], i32 -1)
; NO-VP-NEXT: [[TMP20:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR4]], <vscale x 4 x i32> [[TMP19]], i32 -1)
; NO-VP-NEXT: [[TMP21:%.*]] = add nsw <vscale x 4 x i32> [[TMP19]], [[TMP20]]
; NO-VP-NEXT: [[TMP22:%.*]] = add <vscale x 4 x i32> [[TMP21]], [[TMP18]]
; NO-VP-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP23]], i32 0
-; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP22]], ptr [[TMP24]], align 4
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP22]], ptr [[TMP23]], align 4
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; NO-VP-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NO-VP-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -494,20 +480,62 @@ define i32 @FOR_reduction(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-LABEL: define i32 @FOR_reduction(
; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0]] {
; IF-EVL-NEXT: [[ENTRY:.*]]:
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP9]], 4
+; IF-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TC]], [[TMP1]]
+; IF-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IF-EVL: [[VECTOR_PH]]:
+; IF-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TC]], [[TMP3]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[TC]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; IF-EVL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 33, i32 [[TMP8]]
+; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; IF-EVL: [[VECTOR_BODY]]:
+; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]]
+; IF-EVL-NEXT: [[WIDE_LOAD]] = load <vscale x 4 x i32>, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1)
+; IF-EVL-NEXT: [[TMP11:%.*]] = add nsw <vscale x 4 x i32> [[TMP10]], [[WIDE_LOAD]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]]
+; IF-EVL-NEXT: store <vscale x 4 x i32> [[TMP11]], ptr [[TMP12]], align 4
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS]], [[TMP5]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; IF-EVL: [[MIDDLE_BLOCK]]:
+; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP15:%.*]] = mul nuw i32 [[TMP14]], 4
+; IF-EVL-NEXT: [[TMP16:%.*]] = sub i32 [[TMP15]], 1
+; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP16]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 4
+; IF-EVL-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 2
+; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP19]]
+; IF-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TC]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; IF-EVL: [[SCALAR_PH]]:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
; IF-EVL-NEXT: br label %[[FOR_BODY:.*]]
; IF-EVL: [[FOR_BODY]]:
-; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ]
-; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ 33, %[[ENTRY]] ], [ [[TMP0:%.*]], %[[FOR_BODY]] ]
-; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]]
-; IF-EVL-NEXT: [[TMP0]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ]
+; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP0:%.*]], %[[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0]] = load i32, ptr [[ARRAYIDX1]], align 4
; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[TMP0]]
-; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]]
+; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[IV]]
; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
-; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1
+; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[IV]], 1
; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; IF-EVL: [[FOR_END]]:
-; IF-EVL-NEXT: [[FOR1_LCSSA:%.*]] = phi i32 [ [[FOR1]], %[[FOR_BODY]] ]
+; IF-EVL-NEXT: [[FOR1_LCSSA:%.*]] = phi i32 [ [[FOR1]], %[[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret i32 [[FOR1_LCSSA]]
;
; NO-VP-LABEL: define i32 @FOR_reduction(
@@ -533,13 +561,11 @@ define i32 @FOR_reduction(ptr noalias %A, ptr noalias %B, i64 %TC) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP10]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD]] = load <vscale x 4 x i32>, ptr [[TMP11]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD]] = load <vscale x 4 x i32>, ptr [[TMP10]], align 4
; NO-VP-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1)
; NO-VP-NEXT: [[TMP13:%.*]] = add nsw <vscale x 4 x i32> [[TMP12]], [[WIDE_LOAD]]
; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 0
-; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP13]], ptr [[TMP15]], align 4
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP13]], ptr [[TMP14]], align 4
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; NO-VP-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NO-VP-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -630,8 +656,7 @@ define void @first_order_recurrence_indvar(ptr noalias %A, i64 %TC) {
; IF-EVL-NEXT: [[TMP20]] = add <vscale x 2 x i64> [[VEC_IND]], splat (i64 42)
; IF-EVL-NEXT: [[TMP15:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.splice.nxv2i64(<vscale x 2 x i64> [[VECTOR_RECUR]], <vscale x 2 x i64> [[TMP20]], i32 -1, <vscale x 2 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP11]])
; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i64, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i64, ptr [[TMP9]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP15]], ptr align 8 [[TMP17]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP15]], ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP11]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
@@ -688,8 +713,7 @@ define void @first_order_recurrence_indvar(ptr noalias %A, i64 %TC) {
; NO-VP-NEXT: [[TMP12]] = add <vscale x 2 x i64> [[VEC_IND]], splat (i64 42)
; NO-VP-NEXT: [[TMP13:%.*]] = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> [[VECTOR_RECUR]], <vscale x 2 x i64> [[TMP12]], i32 -1)
; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i64, ptr [[A]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i64, ptr [[TMP11]], i32 0
-; NO-VP-NEXT: store <vscale x 2 x i64> [[TMP13]], ptr [[TMP15]], align 8
+; NO-VP-NEXT: store <vscale x 2 x i64> [[TMP13]], ptr [[TMP11]], align 8
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
; NO-VP-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; NO-VP-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -749,8 +773,8 @@ for.end:
; IF-EVL: [[LOOP6]] = distinct !{[[LOOP6]], [[META3]], [[META1]]}
; IF-EVL: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]], [[META3]]}
; IF-EVL: [[LOOP8]] = distinct !{[[LOOP8]], [[META3]], [[META1]]}
-; IF-EVL: [[LOOP9]] = distinct !{[[LOOP9]], [[META10:![0-9]+]]}
-; IF-EVL: [[META10]] = !{!"llvm.loop.vectorize.enable", i1 true}
+; IF-EVL: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META3]]}
+; IF-EVL: [[LOOP10]] = distinct !{[[LOOP10]], [[META3]], [[META1]]}
; IF-EVL: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]], [[META3]]}
; IF-EVL: [[LOOP12]] = distinct !{[[LOOP12]], [[META3]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll
index 2b7a9fb..892293b 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll
@@ -1,12 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=IF-EVL
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=none \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP
define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %index, i64 %n) {
@@ -69,18 +67,54 @@ define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %inde
;
; NO-VP-LABEL: @gather_scatter(
; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 2
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP14]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
+; NO-VP-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; NO-VP-NEXT: [[TMP7:%.*]] = mul <vscale x 2 x i64> [[TMP6]], splat (i64 1)
+; NO-VP-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP7]]
+; NO-VP-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP5]]
+; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP8]], i64 0
+; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], <vscale x 2 x i64> [[VEC_IND]]
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP9]], i32 8, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> poison)
+; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]]
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> [[TMP10]], i32 4, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x float> poison)
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]]
+; NO-VP-NEXT: call void @llvm.masked.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> [[WIDE_MASKED_GATHER2]], <vscale x 2 x ptr> [[TMP11]], i32 4, <vscale x 2 x i1> splat (i1 true))
+; NO-VP-NEXT: [[INDVARS_IV_NEXT]] = add nuw i64 [[INDVARS_IV]], [[TMP5]]
+; NO-VP-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; NO-VP-NEXT: br label [[FOR_BODY1:%.*]]
; NO-VP: for.body:
-; NO-VP-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-VP-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], i64 [[INDVARS_IV]]
+; NO-VP-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT1:%.*]], [[FOR_BODY1]] ]
+; NO-VP-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX]], i64 [[INDVARS_IV1]]
; NO-VP-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
-; NO-VP-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[TMP0]]
; NO-VP-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; NO-VP-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[TMP0]]
; NO-VP-NEXT: store float [[TMP1]], ptr [[ARRAYIDX7]], align 4
-; NO-VP-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N:%.*]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; NO-VP-NEXT: [[INDVARS_IV_NEXT1]] = add nuw nsw i64 [[INDVARS_IV1]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT1]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]]
; NO-VP: for.end:
; NO-VP-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-inloop-reduction.ll
index 87ac697..e0594ad 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-inloop-reduction.ll
@@ -1,14 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=loop-vectorize \
; RUN: -prefer-inloop-reductions \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=IF-EVL
; RUN: opt -passes=loop-vectorize \
; RUN: -prefer-inloop-reductions \
-; RUN: -force-tail-folding-style=none \
-; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=NO-VP
define i32 @add(ptr %a, i64 %n, i32 %start) {
@@ -32,8 +30,7 @@ define i32 @add(ptr %a, i64 %n, i32 %start) {
; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP15]] = add i32 [[TMP14]], [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP10]] to i64
@@ -78,8 +75,7 @@ define i32 @add(ptr %a, i64 %n, i32 %start) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
; NO-VP-NEXT: [[TMP10]] = add i32 [[TMP9]], [[VEC_PHI]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -126,36 +122,34 @@ for.end:
define i32 @mul(ptr %a, i64 %n, i32 %start) {
; IF-EVL-LABEL: @mul(
; IF-EVL-NEXT: entry:
-; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; IF-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_RND_UP:%.*]], 8
+; IF-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
; IF-EVL: vector.ph:
-; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 3
-; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[IV]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
-; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
-; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[TMP1]], <4 x i32> poison)
-; IF-EVL-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> splat (i32 1)
+; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 4
+; IF-EVL-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; IF-EVL-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4]])
; IF-EVL-NEXT: [[MUL]] = mul i32 [[TMP5]], [[RDX]]
-; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD2]])
+; IF-EVL-NEXT: [[TMP6]] = mul i32 [[TMP8]], [[VEC_PHI1]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 8
; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; IF-EVL: middle.block:
-; IF-EVL-NEXT: br label [[FOR_END:%.*]]
+; IF-EVL-NEXT: [[BIN_RDX:%.*]] = mul i32 [[TMP6]], [[MUL]]
+; IF-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_RND_UP]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ]
-; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY1]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ]
; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]]
; IF-EVL: for.body:
; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
@@ -164,10 +158,10 @@ define i32 @mul(ptr %a, i64 %n, i32 %start) {
; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; IF-EVL-NEXT: [[MUL1]] = mul nsw i32 [[TMP0]], [[RDX1]]
; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N_RND_UP]]
; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP6:![0-9]+]]
; IF-EVL: for.end:
-; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL1]], [[FOR_BODY1]] ], [ [[MUL]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL1]], [[FOR_BODY1]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret i32 [[MUL_LCSSA]]
;
; NO-VP-LABEL: @mul(
@@ -183,9 +177,8 @@ define i32 @mul(ptr %a, i64 %n, i32 %start) {
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 4
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
; NO-VP-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD]])
; NO-VP-NEXT: [[TMP7]] = mul i32 [[TMP6]], [[VEC_PHI]]
@@ -253,8 +246,7 @@ define i32 @or(ptr %a, i64 %n, i32 %start) {
; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.or.nxv4i32(i32 0, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP15]] = or i32 [[TMP14]], [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP10]] to i64
@@ -299,8 +291,7 @@ define i32 @or(ptr %a, i64 %n, i32 %start) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
; NO-VP-NEXT: [[TMP10]] = or i32 [[TMP9]], [[VEC_PHI]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -364,8 +355,7 @@ define i32 @and(ptr %a, i64 %n, i32 %start) {
; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.and.nxv4i32(i32 -1, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP15]] = and i32 [[TMP14]], [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP10]] to i64
@@ -410,8 +400,7 @@ define i32 @and(ptr %a, i64 %n, i32 %start) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
; NO-VP-NEXT: [[TMP10]] = and i32 [[TMP9]], [[VEC_PHI]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -475,8 +464,7 @@ define i32 @xor(ptr %a, i64 %n, i32 %start) {
; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.xor.nxv4i32(i32 0, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP15]] = xor i32 [[TMP14]], [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP10]] to i64
@@ -521,8 +509,7 @@ define i32 @xor(ptr %a, i64 %n, i32 %start) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
; NO-VP-NEXT: [[TMP10]] = xor i32 [[TMP9]], [[VEC_PHI]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -586,8 +573,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.smin.nxv4i32(i32 2147483647, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smin.i32(i32 [[TMP14]], i32 [[VEC_PHI]])
; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64
@@ -633,8 +619,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
; NO-VP-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smin.i32(i32 [[TMP9]], i32 [[VEC_PHI]])
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -700,8 +685,7 @@ define i32 @smax(ptr %a, i64 %n, i32 %start) {
; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.smax.nxv4i32(i32 -2147483648, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smax.i32(i32 [[TMP14]], i32 [[VEC_PHI]])
; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64
@@ -747,8 +731,7 @@ define i32 @smax(ptr %a, i64 %n, i32 %start) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
; NO-VP-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smax.i32(i32 [[TMP9]], i32 [[VEC_PHI]])
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -814,8 +797,7 @@ define i32 @umin(ptr %a, i64 %n, i32 %start) {
; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.umin.nxv4i32(i32 -1, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[RDX_MINMAX]] = call i32 @llvm.umin.i32(i32 [[TMP14]], i32 [[VEC_PHI]])
; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64
@@ -861,8 +843,7 @@ define i32 @umin(ptr %a, i64 %n, i32 %start) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
; NO-VP-NEXT: [[RDX_MINMAX]] = call i32 @llvm.umin.i32(i32 [[TMP9]], i32 [[VEC_PHI]])
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -928,8 +909,7 @@ define i32 @umax(ptr %a, i64 %n, i32 %start) {
; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vp.reduce.umax.nxv4i32(i32 0, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[RDX_MINMAX]] = call i32 @llvm.umax.i32(i32 [[TMP14]], i32 [[VEC_PHI]])
; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64
@@ -975,8 +955,7 @@ define i32 @umax(ptr %a, i64 %n, i32 %start) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
; NO-VP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
; NO-VP-NEXT: [[RDX_MINMAX]] = call i32 @llvm.umax.i32(i32 [[TMP9]], i32 [[VEC_PHI]])
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -1042,8 +1021,7 @@ define float @fadd(ptr %a, i64 %n, float %start) {
; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP14:%.*]] = call reassoc float @llvm.vp.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP15]] = fadd reassoc float [[TMP14]], [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP10]] to i64
@@ -1088,8 +1066,7 @@ define float @fadd(ptr %a, i64 %n, float %start) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP7]], align 4
; NO-VP-NEXT: [[TMP9:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[WIDE_LOAD]])
; NO-VP-NEXT: [[TMP10]] = fadd reassoc float [[TMP9]], [[VEC_PHI]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -1136,36 +1113,34 @@ for.end:
define float @fmul(ptr %a, i64 %n, float %start) {
; IF-EVL-LABEL: @fmul(
; IF-EVL-NEXT: entry:
-; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; IF-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_RND_UP:%.*]], 8
+; IF-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
; IF-EVL: vector.ph:
-; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 3
-; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[IV]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
-; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[VEC_PHI1:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
-; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[TMP1]], <4 x float> poison)
-; IF-EVL-NEXT: [[TMP4:%.*]] = select reassoc <4 x i1> [[TMP1]], <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> splat (float 1.000000e+00)
+; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 4
+; IF-EVL-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; IF-EVL-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; IF-EVL-NEXT: [[TMP5:%.*]] = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP4]])
; IF-EVL-NEXT: [[MUL]] = fmul reassoc float [[TMP5]], [[RDX]]
-; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[WIDE_LOAD2]])
+; IF-EVL-NEXT: [[TMP6]] = fmul reassoc float [[TMP8]], [[VEC_PHI1]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 8
; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; IF-EVL: middle.block:
-; IF-EVL-NEXT: br label [[FOR_END:%.*]]
+; IF-EVL-NEXT: [[BIN_RDX:%.*]] = fmul reassoc float [[TMP6]], [[MUL]]
+; IF-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_RND_UP]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ]
-; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY1]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ]
; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]]
; IF-EVL: for.body:
; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
@@ -1174,10 +1149,10 @@ define float @fmul(ptr %a, i64 %n, float %start) {
; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; IF-EVL-NEXT: [[MUL1]] = fmul reassoc float [[TMP0]], [[RDX1]]
; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N_RND_UP]]
; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP24:![0-9]+]]
; IF-EVL: for.end:
-; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL1]], [[FOR_BODY1]] ], [ [[MUL]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL1]], [[FOR_BODY1]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret float [[MUL_LCSSA]]
;
; NO-VP-LABEL: @fmul(
@@ -1193,9 +1168,8 @@ define float @fmul(ptr %a, i64 %n, float %start) {
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI1:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 4
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
; NO-VP-NEXT: [[TMP6:%.*]] = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[WIDE_LOAD]])
; NO-VP-NEXT: [[TMP7]] = fmul reassoc float [[TMP6]], [[VEC_PHI]]
@@ -1263,8 +1237,7 @@ define float @fmin(ptr %a, i64 %n, float %start) #0 {
; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP14:%.*]] = call fast float @llvm.vp.reduce.fmin.nxv4f32(float 0x47EFFFFFE0000000, <vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt float [[TMP14]], [[VEC_PHI]]
; IF-EVL-NEXT: [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP14]], float [[VEC_PHI]]
@@ -1311,8 +1284,7 @@ define float @fmin(ptr %a, i64 %n, float %start) #0 {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX_SELECT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP7]], align 4
; NO-VP-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
; NO-VP-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt float [[TMP9]], [[VEC_PHI]]
; NO-VP-NEXT: [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP9]], float [[VEC_PHI]]
@@ -1379,8 +1351,7 @@ define float @fmax(ptr %a, i64 %n, float %start) #0 {
; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP14:%.*]] = call fast float @llvm.vp.reduce.fmax.nxv4f32(float 0xC7EFFFFFE0000000, <vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt float [[TMP14]], [[VEC_PHI]]
; IF-EVL-NEXT: [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP14]], float [[VEC_PHI]]
@@ -1427,8 +1398,7 @@ define float @fmax(ptr %a, i64 %n, float %start) #0 {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX_SELECT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP7]], align 4
; NO-VP-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]])
; NO-VP-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt float [[TMP9]], [[VEC_PHI]]
; NO-VP-NEXT: [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP9]], float [[VEC_PHI]]
@@ -1477,38 +1447,35 @@ for.end:
define float @fminimum(ptr %a, i64 %n, float %start) {
; IF-EVL-LABEL: @fminimum(
; IF-EVL-NEXT: entry:
-; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; IF-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_RND_UP:%.*]], 16
+; IF-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
; IF-EVL: vector.ph:
-; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 7
-; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT3]], <8 x i64> poison, <8 x i32> zeroinitializer
; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <8 x float> poison, float [[START:%.*]], i64 0
; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT2]], <8 x float> poison, <8 x i32> zeroinitializer
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[BROADCAST_SPLAT]], [[ENTRY]] ], [ [[TMP4:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[IV]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
-; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
-; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT4]]
+; IF-EVL-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ [[BROADCAST_SPLAT]], [[ENTRY]] ], [ [[TMP3:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
-; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison)
+; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 8
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4
+; IF-EVL-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP1]], align 4
; IF-EVL-NEXT: [[TMP4]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_MASKED_LOAD]])
-; IF-EVL-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[TMP4]], <8 x float> [[VEC_PHI]]
-; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 8
+; IF-EVL-NEXT: [[TMP3]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI1]], <8 x float> [[WIDE_LOAD2]])
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16
; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
; IF-EVL: middle.block:
+; IF-EVL-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP3]])
; IF-EVL-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> [[TMP5]])
-; IF-EVL-NEXT: br label [[FOR_END:%.*]]
+; IF-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_RND_UP]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ]
-; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY1]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ]
; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]]
; IF-EVL: for.body:
; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
@@ -1517,7 +1484,7 @@ define float @fminimum(ptr %a, i64 %n, float %start) {
; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; IF-EVL-NEXT: [[MIN]] = tail call float @llvm.minimum.f32(float [[RDX]], float [[TMP0]])
; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N_RND_UP]]
; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP30:![0-9]+]]
; IF-EVL: for.end:
; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY1]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
@@ -1538,9 +1505,8 @@ define float @fminimum(ptr %a, i64 %n, float %start) {
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 8
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP4]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4
; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP5]], align 4
; NO-VP-NEXT: [[TMP6]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_LOAD]])
; NO-VP-NEXT: [[TMP7]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI1]], <8 x float> [[WIDE_LOAD2]])
@@ -1589,38 +1555,35 @@ for.end:
define float @fmaximum(ptr %a, i64 %n, float %start) {
; IF-EVL-LABEL: @fmaximum(
; IF-EVL-NEXT: entry:
-; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; IF-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_RND_UP:%.*]], 16
+; IF-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
; IF-EVL: vector.ph:
-; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 7
-; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT3]], <8 x i64> poison, <8 x i32> zeroinitializer
; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <8 x float> poison, float [[START:%.*]], i64 0
; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT2]], <8 x float> poison, <8 x i32> zeroinitializer
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[BROADCAST_SPLAT]], [[ENTRY]] ], [ [[TMP4:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[IV]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
-; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
-; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT4]]
+; IF-EVL-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ [[BROADCAST_SPLAT]], [[ENTRY]] ], [ [[TMP3:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
-; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison)
+; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 8
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4
+; IF-EVL-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP1]], align 4
; IF-EVL-NEXT: [[TMP4]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_MASKED_LOAD]])
-; IF-EVL-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[TMP4]], <8 x float> [[VEC_PHI]]
-; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 8
+; IF-EVL-NEXT: [[TMP3]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI1]], <8 x float> [[WIDE_LOAD2]])
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16
; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
; IF-EVL: middle.block:
+; IF-EVL-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP3]])
; IF-EVL-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> [[TMP5]])
-; IF-EVL-NEXT: br label [[FOR_END:%.*]]
+; IF-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_RND_UP]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ]
-; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY1]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ]
; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]]
; IF-EVL: for.body:
; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
@@ -1629,7 +1592,7 @@ define float @fmaximum(ptr %a, i64 %n, float %start) {
; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; IF-EVL-NEXT: [[MAX]] = tail call float @llvm.maximum.f32(float [[RDX]], float [[TMP0]])
; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N_RND_UP]]
; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP32:![0-9]+]]
; IF-EVL: for.end:
; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY1]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
@@ -1650,9 +1613,8 @@ define float @fmaximum(ptr %a, i64 %n, float %start) {
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 8
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP4]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4
; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP5]], align 4
; NO-VP-NEXT: [[TMP6]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_LOAD]])
; NO-VP-NEXT: [[TMP7]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI1]], <8 x float> [[WIDE_LOAD2]])
@@ -1719,11 +1681,9 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) {
; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP16:%.*]] = fmul reassoc <vscale x 4 x float> [[VP_OP_LOAD]], [[VP_OP_LOAD1]]
; IF-EVL-NEXT: [[TMP17:%.*]] = call reassoc float @llvm.vp.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP18]] = fadd reassoc float [[TMP17]], [[VEC_PHI]]
@@ -1771,11 +1731,9 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi float [ [[START:%.*]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP7]], align 4
; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP10]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP9]], align 4
; NO-VP-NEXT: [[TMP11:%.*]] = fmul reassoc <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; NO-VP-NEXT: [[TMP12:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP11]])
; NO-VP-NEXT: [[TMP13]] = fadd reassoc float [[TMP12]], [[VEC_PHI]]
@@ -1844,8 +1802,7 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 3)
; IF-EVL-NEXT: [[TMP16]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64
@@ -1894,8 +1851,7 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
; NO-VP-NEXT: [[TMP9:%.*]] = icmp slt <vscale x 4 x i32> [[WIDE_LOAD]], splat (i32 3)
; NO-VP-NEXT: [[TMP10]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP9]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -1964,8 +1920,7 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP14:%.*]] = fcmp fast olt <vscale x 4 x float> [[VP_OP_LOAD]], splat (float 3.000000e+00)
; IF-EVL-NEXT: [[TMP16]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64
@@ -2014,8 +1969,7 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP7]], align 4
; NO-VP-NEXT: [[TMP9:%.*]] = fcmp fast olt <vscale x 4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00)
; NO-VP-NEXT: [[TMP10]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP9]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
index acfcf90..80b0714 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
@@ -1,12 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=IF-EVL %s
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=none \
-; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=NO-VP %s
; FIXME: interleaved accesses are not supported yet with predicated vectorization.
@@ -42,15 +40,14 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; IF-EVL-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[VEC_IND]], i32 1
; IF-EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP23]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
; IF-EVL-NEXT: [[TMP26:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER5]], [[WIDE_MASKED_GATHER3]]
-; IF-EVL-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP26]], ptr align 4 [[TMP29]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
-; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP11]] to i64
-; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP26]], ptr align 4 [[TMP18]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP11]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; IF-EVL-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; IF-EVL: scalar.ph:
@@ -59,10 +56,10 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; IF-EVL: for.body:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 0
-; IF-EVL-NEXT: [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 1
-; IF-EVL-NEXT: [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP35]], [[TMP34]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP16]]
; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
@@ -94,8 +91,7 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
; NO-VP-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
; NO-VP-NEXT: [[TMP21:%.*]] = add nsw <vscale x 4 x i32> [[TMP19]], [[TMP18]]
; NO-VP-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 0
-; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP21]], ptr [[TMP24]], align 4
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP21]], ptr [[TMP22]], align 4
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
; NO-VP-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NO-VP-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-intermediate-store.ll
index 3e804c0..c7808b1 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-intermediate-store.ll
@@ -1,24 +1,20 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefixes=IF-EVL-OUTLOOP
; RUN: opt -passes=loop-vectorize \
; RUN: -prefer-inloop-reductions \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefixes=IF-EVL-INLOOP
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=none \
-; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefixes=NO-VP-OUTLOOP
; RUN: opt -passes=loop-vectorize \
; RUN: -prefer-inloop-reductions \
-; RUN: -force-tail-folding-style=none \
-; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefixes=NO-VP-INLOOP
define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) {
@@ -52,8 +48,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr)
; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
-; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]]), !alias.scope [[META0:![0-9]+]]
+; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]]), !alias.scope [[META0:![0-9]+]]
; IF-EVL-OUTLOOP-NEXT: [[VP_OP:%.*]] = add <vscale x 4 x i32> [[VP_OP_LOAD]], [[VEC_PHI]]
; IF-EVL-OUTLOOP-NEXT: [[TMP19]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[VP_OP]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP12]])
; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = zext i32 [[TMP12]] to i64
@@ -111,8 +106,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr)
; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP13]], i32 4, i1 true)
; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]]
-; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
-; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]]), !alias.scope [[META0:![0-9]+]]
+; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]]), !alias.scope [[META0:![0-9]+]]
; IF-EVL-INLOOP-NEXT: [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
; IF-EVL-INLOOP-NEXT: [[TMP22]] = add i32 [[TMP21]], [[VEC_PHI]]
; IF-EVL-INLOOP-NEXT: [[TMP23:%.*]] = zext i32 [[TMP14]] to i64
@@ -168,8 +162,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr)
; NO-VP-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
; NO-VP-OUTLOOP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; NO-VP-OUTLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
-; NO-VP-OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP11]], align 4, !alias.scope [[META0:![0-9]+]]
+; NO-VP-OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP10]], align 4, !alias.scope [[META0:![0-9]+]]
; NO-VP-OUTLOOP-NEXT: [[TMP12]] = add <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
; NO-VP-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
; NO-VP-OUTLOOP-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -223,8 +216,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr)
; NO-VP-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
; NO-VP-INLOOP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; NO-VP-INLOOP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; NO-VP-INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP10]], align 4, !alias.scope [[META0:![0-9]+]]
+; NO-VP-INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP9]], align 4, !alias.scope [[META0:![0-9]+]]
; NO-VP-INLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
; NO-VP-INLOOP-NEXT: [[TMP12]] = add i32 [[TMP11]], [[VEC_PHI]]
; NO-VP-INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-iv32.ll
index 3d91738..bda9145 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-iv32.ll
@@ -1,12 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=IF-EVL %s
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=none \
-; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=NO-VP %s
define void @iv32(ptr noalias %a, ptr noalias %b, i32 %N) {
@@ -29,11 +27,9 @@ define void @iv32(ptr noalias %a, ptr noalias %b, i32 %N) {
; IF-EVL-NEXT: [[TMP11:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[TMP11]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP_LOAD]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP_LOAD]], ptr align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP12]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[IV_NEXT]] = add i32 [[IV]], [[TMP10]]
; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i32 [[IV_NEXT]], [[N_VEC]]
@@ -72,11 +68,9 @@ define void @iv32(ptr noalias %a, ptr noalias %b, i32 %N) {
; NO-VP: vector.body:
; NO-VP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[INDEX]]
-; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP4]], align 4
; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]]
-; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; NO-VP-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD]], ptr [[TMP7]], align 4
+; NO-VP-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD]], ptr [[TMP6]], align 4
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP12]]
; NO-VP-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; NO-VP-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-known-no-overflow.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-known-no-overflow.ll
index 258b7ce..2601bfd 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-known-no-overflow.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-known-no-overflow.ll
@@ -1,7 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s
; TODO: We know the IV will never overflow here so we can skip the overflow
@@ -32,10 +31,9 @@ define void @trip_count_max_1024(ptr %p, i64 %tc) vscale_range(2, 1024) {
; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[UMAX]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[P]], i64 [[EVL_BASED_IV]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP12]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
+; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP11]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
; CHECK-NEXT: [[VP_OP:%.*]] = add <vscale x 2 x i64> [[VP_OP_LOAD]], splat (i64 1)
-; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP]], ptr align 8 [[TMP12]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP]], ptr align 8 [[TMP11]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP9]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP13]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
@@ -103,10 +101,9 @@ define void @overflow_at_0(ptr %p, i64 %tc) vscale_range(2, 1024) {
; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[P]], i64 [[EVL_BASED_IV]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP12]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
+; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP11]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
; CHECK-NEXT: [[VP_OP:%.*]] = add <vscale x 2 x i64> [[VP_OP_LOAD]], splat (i64 1)
-; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP]], ptr align 8 [[TMP12]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP]], ptr align 8 [[TMP11]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP9]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP13]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
@@ -174,10 +171,9 @@ define void @no_overflow_at_0(ptr %p, i64 %tc) vscale_range(2, 1024) {
; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[TC_ADD]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[P]], i64 [[EVL_BASED_IV]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
+; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
; CHECK-NEXT: [[VP_OP:%.*]] = add <vscale x 2 x i64> [[VP_OP_LOAD]], splat (i64 1)
-; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP]], ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
+; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP]], ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP5]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP9]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-masked-loadstore.ll
index 63acbdf..226372f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-masked-loadstore.ll
@@ -1,12 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=IF-EVL
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=none \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP
define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) {
@@ -29,14 +27,12 @@ define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) {
; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP17:%.*]] = icmp ne <vscale x 4 x i32> [[VP_OP_LOAD]], zeroinitializer
; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD3:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[TMP17]], i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD3:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP19]], <vscale x 4 x i1> [[TMP17]], i32 [[TMP10]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = add <vscale x 4 x i32> [[VP_OP_LOAD]], [[VP_OP_LOAD3]]
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP]], ptr align 4 [[TMP20]], <vscale x 4 x i1> [[TMP17]], i32 [[TMP10]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP]], ptr align 4 [[TMP19]], <vscale x 4 x i1> [[TMP17]], i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP10]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
@@ -68,23 +64,52 @@ define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) {
;
; NO-VP-LABEL: @masked_loadstore(
; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP12]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[INC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
+; NO-VP-NEXT: [[TMP7:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i32> poison)
+; NO-VP-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], [[WIDE_MASKED_LOAD]]
+; NO-VP-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP9]], ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[TMP7]])
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[INC]]
+; NO-VP-NEXT: br i1 [[TMP10]], label [[FOR_INC:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[INC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[I_011:%.*]] = phi i64 [ [[INC]], [[FOR_INC]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: br label [[FOR_BODY1:%.*]]
; NO-VP: for.body:
-; NO-VP-NEXT: [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ]
-; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[I_011]]
+; NO-VP-NEXT: [[I_11:%.*]] = phi i64 [ [[INC1:%.*]], [[FOR_INC1:%.*]] ], [ [[I_011]], [[SCALAR_PH]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I_11]]
; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; NO-VP-NEXT: [[CMP1:%.*]] = icmp ne i32 [[TMP0]], 0
-; NO-VP-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; NO-VP-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC1]]
; NO-VP: if.then:
-; NO-VP-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[I_011]]
+; NO-VP-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_11]]
; NO-VP-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
; NO-VP-NEXT: [[ADD:%.*]] = add i32 [[TMP0]], [[TMP1]]
; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4
-; NO-VP-NEXT: br label [[FOR_INC]]
+; NO-VP-NEXT: br label [[FOR_INC1]]
; NO-VP: for.inc:
-; NO-VP-NEXT: [[INC]] = add nuw nsw i64 [[I_011]], 1
-; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N:%.*]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; NO-VP-NEXT: [[INC1]] = add nuw nsw i64 [[I_11]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC1]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]]
; NO-VP: exit:
; NO-VP-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-no-masking.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-no-masking.ll
index 1c49fba..6285a6f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-no-masking.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-no-masking.ll
@@ -1,12 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=none \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s
; No need to emit predicated vector code if the vector instructions with masking are not required.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-ordered-reduction.ll
index a97c4b3..41bddaa 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-ordered-reduction.ll
@@ -1,14 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=loop-vectorize \
; RUN: -force-ordered-reductions=true -hints-allow-reordering=false \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=IF-EVL
; RUN: opt -passes=loop-vectorize \
; RUN: -force-ordered-reductions=true -hints-allow-reordering=false \
-; RUN: -force-tail-folding-style=none \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=NO-VP
define float @fadd(ptr noalias nocapture readonly %a, i64 %n) {
@@ -32,8 +30,7 @@ define float @fadd(ptr noalias nocapture readonly %a, i64 %n) {
; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP14]] = call float @llvm.vp.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
@@ -61,18 +58,45 @@ define float @fadd(ptr noalias nocapture readonly %a, i64 %n) {
;
; NO-VP-LABEL: @fadd(
; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N1:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N1]], [[TMP3]]
+; NO-VP-NEXT: [[N:%.*]] = sub i64 [[N1]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
-; NO-VP: for.body:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; NO-VP-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
-; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; NO-VP-NEXT: [[ADD]] = fadd float [[TMP0]], [[SUM_07]]
-; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ADD]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[SUM_07]], <vscale x 4 x float> [[WIDE_LOAD]])
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP5]]
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N1]], [[N]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; NO-VP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[ADD]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY1]] ]
+; NO-VP-NEXT: br label [[FOR_BODY1:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
+; NO-VP-NEXT: [[SUM_7:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY1]] ]
+; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]]
+; NO-VP-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
+; NO-VP-NEXT: [[ADD1]] = fadd float [[TMP9]], [[SUM_7]]
+; NO-VP-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT1:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N1]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT1]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]]
; NO-VP: for.end:
-; NO-VP-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY1]] ], [ [[ADD]], [[MIDDLE_BLOCK]] ]
; NO-VP-NEXT: ret float [[ADD_LCSSA]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reduction-cost.ll
index 15f9993..10d83f4 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reduction-cost.ll
@@ -1,7 +1,6 @@
; REQUIRES: asserts
; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize --disable-output \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck %s
; CHECK: Cost of 2 for VF vscale x 4: WIDEN-INTRINSIC vp<%{{.+}}> = call llvm.vp.merge(ir<true>, ir<%add>, ir<%rdx>, vp<%{{.+}}>)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reduction.ll
index 8b14414..3a963b0 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reduction.ll
@@ -1,12 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=IF-EVL
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=none \
-; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=NO-VP
define i32 @add(ptr %a, i64 %n, i32 %start) {
@@ -31,8 +29,7 @@ define i32 @add(ptr %a, i64 %n, i32 %start) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = add <vscale x 4 x i32> [[VP_OP_LOAD]], [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP14]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[VP_OP]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64
@@ -79,8 +76,7 @@ define i32 @add(ptr %a, i64 %n, i32 %start) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP9]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
; NO-VP-NEXT: [[TMP10]] = add <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -127,37 +123,34 @@ for.end:
define i32 @mul(ptr %a, i64 %n, i32 %start) {
; IF-EVL-LABEL: @mul(
; IF-EVL-NEXT: entry:
-; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; IF-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_RND_UP:%.*]], 16
+; IF-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
; IF-EVL: vector.ph:
-; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 7
-; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
; IF-EVL-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> splat (i32 1), i32 [[START:%.*]], i32 0
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ [[TMP9]], [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[IV]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
-; IF-EVL-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ splat (i32 1), [[ENTRY]] ], [ [[TMP4:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
-; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x i32> poison)
+; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 8
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
+; IF-EVL-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4
; IF-EVL-NEXT: [[TMP5]] = mul <8 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
-; IF-EVL-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP5]], <8 x i32> [[VEC_PHI]]
-; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 8
+; IF-EVL-NEXT: [[TMP4]] = mul <8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16
; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; IF-EVL: middle.block:
+; IF-EVL-NEXT: [[TMP6:%.*]] = mul <8 x i32> [[TMP4]], [[TMP5]]
; IF-EVL-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP6]])
-; IF-EVL-NEXT: br label [[FOR_END:%.*]]
+; IF-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_RND_UP]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ]
-; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY1]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ]
; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]]
; IF-EVL: for.body:
; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
@@ -166,7 +159,7 @@ define i32 @mul(ptr %a, i64 %n, i32 %start) {
; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; IF-EVL-NEXT: [[MUL]] = mul nsw i32 [[TMP0]], [[RDX]]
; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N_RND_UP]]
; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP6:![0-9]+]]
; IF-EVL: for.end:
; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY1]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
@@ -186,9 +179,8 @@ define i32 @mul(ptr %a, i64 %n, i32 %start) {
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ splat (i32 1), [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 8
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4
; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
; NO-VP-NEXT: [[TMP5]] = mul <8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
; NO-VP-NEXT: [[TMP6]] = mul <8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
@@ -256,8 +248,7 @@ define i32 @or(ptr %a, i64 %n, i32 %start) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = or <vscale x 4 x i32> [[VP_OP_LOAD]], [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP14]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[VP_OP]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64
@@ -304,8 +295,7 @@ define i32 @or(ptr %a, i64 %n, i32 %start) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP9]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
; NO-VP-NEXT: [[TMP10]] = or <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -370,8 +360,7 @@ define i32 @and(ptr %a, i64 %n, i32 %start) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = and <vscale x 4 x i32> [[VP_OP_LOAD]], [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP14]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[VP_OP]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64
@@ -418,8 +407,7 @@ define i32 @and(ptr %a, i64 %n, i32 %start) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP9]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
; NO-VP-NEXT: [[TMP10]] = and <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -484,8 +472,7 @@ define i32 @xor(ptr %a, i64 %n, i32 %start) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = xor <vscale x 4 x i32> [[VP_OP_LOAD]], [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP14]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[VP_OP]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64
@@ -532,8 +519,7 @@ define i32 @xor(ptr %a, i64 %n, i32 %start) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP9]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
; NO-VP-NEXT: [[TMP10]] = xor <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -599,8 +585,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP13:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP14:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP15]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP14]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP9]])
@@ -650,8 +635,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[BROADCAST_SPLAT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
; NO-VP-NEXT: [[TMP9:%.*]] = icmp slt <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
; NO-VP-NEXT: [[TMP10]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -720,8 +704,7 @@ define i32 @smax(ptr %a, i64 %n, i32 %start) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP13:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP14:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP15]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP14]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP9]])
@@ -771,8 +754,7 @@ define i32 @smax(ptr %a, i64 %n, i32 %start) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[BROADCAST_SPLAT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
; NO-VP-NEXT: [[TMP9:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
; NO-VP-NEXT: [[TMP10]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -841,8 +823,7 @@ define i32 @umin(ptr %a, i64 %n, i32 %start) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP13:%.*]] = icmp ult <vscale x 4 x i32> [[VP_OP_LOAD]], [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP14:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP15]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP14]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP9]])
@@ -892,8 +873,7 @@ define i32 @umin(ptr %a, i64 %n, i32 %start) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[BROADCAST_SPLAT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
; NO-VP-NEXT: [[TMP9:%.*]] = icmp ult <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
; NO-VP-NEXT: [[TMP10]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -962,8 +942,7 @@ define i32 @umax(ptr %a, i64 %n, i32 %start) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP13:%.*]] = icmp ugt <vscale x 4 x i32> [[VP_OP_LOAD]], [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP14:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP15]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP14]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP9]])
@@ -1013,8 +992,7 @@ define i32 @umax(ptr %a, i64 %n, i32 %start) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[BROADCAST_SPLAT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
; NO-VP-NEXT: [[TMP9:%.*]] = icmp ugt <vscale x 4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
; NO-VP-NEXT: [[TMP10]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -1082,8 +1060,7 @@ define float @fadd(ptr %a, i64 %n, float %start) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = fadd reassoc <vscale x 4 x float> [[VP_OP_LOAD]], [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP14]] = call <vscale x 4 x float> @llvm.vp.merge.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[VP_OP]], <vscale x 4 x float> [[VEC_PHI]], i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP10]] to i64
@@ -1130,8 +1107,7 @@ define float @fadd(ptr %a, i64 %n, float %start) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP9]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
; NO-VP-NEXT: [[TMP10]] = fadd reassoc <vscale x 4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1178,37 +1154,34 @@ for.end:
define float @fmul(ptr %a, i64 %n, float %start) {
; IF-EVL-LABEL: @fmul(
; IF-EVL-NEXT: entry:
-; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; IF-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_RND_UP:%.*]], 16
+; IF-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
; IF-EVL: vector.ph:
-; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 7
-; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
; IF-EVL-NEXT: [[TMP9:%.*]] = insertelement <8 x float> splat (float 1.000000e+00), float [[START:%.*]], i32 0
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[TMP9]], [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[IV]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
-; IF-EVL-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ splat (float 1.000000e+00), [[ENTRY]] ], [ [[TMP4:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
-; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x float> poison)
+; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 8
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <8 x float>, ptr [[TMP3]], align 4
+; IF-EVL-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP2]], align 4
; IF-EVL-NEXT: [[TMP5]] = fmul reassoc <8 x float> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
-; IF-EVL-NEXT: [[TMP6:%.*]] = select reassoc <8 x i1> [[TMP2]], <8 x float> [[TMP5]], <8 x float> [[VEC_PHI]]
-; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 8
+; IF-EVL-NEXT: [[TMP4]] = fmul reassoc <8 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16
; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; IF-EVL: middle.block:
+; IF-EVL-NEXT: [[TMP6:%.*]] = fmul reassoc <8 x float> [[TMP4]], [[TMP5]]
; IF-EVL-NEXT: [[TMP8:%.*]] = call reassoc float @llvm.vector.reduce.fmul.v8f32(float 1.000000e+00, <8 x float> [[TMP6]])
-; IF-EVL-NEXT: br label [[FOR_END:%.*]]
+; IF-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_RND_UP]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ]
-; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY1]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ]
; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]]
; IF-EVL: for.body:
; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
@@ -1217,7 +1190,7 @@ define float @fmul(ptr %a, i64 %n, float %start) {
; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; IF-EVL-NEXT: [[MUL]] = fmul reassoc float [[TMP0]], [[RDX]]
; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N_RND_UP]]
; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP24:![0-9]+]]
; IF-EVL: for.end:
; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY1]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
@@ -1237,9 +1210,8 @@ define float @fmul(ptr %a, i64 %n, float %start) {
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ splat (float 1.000000e+00), [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 8
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP3]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4
; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP4]], align 4
; NO-VP-NEXT: [[TMP5]] = fmul reassoc <8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
; NO-VP-NEXT: [[TMP6]] = fmul reassoc <8 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
@@ -1308,8 +1280,7 @@ define float @fmin(ptr %a, i64 %n, float %start) #0 {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP13:%.*]] = fcmp fast olt <vscale x 4 x float> [[VP_OP_LOAD]], [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP14:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x float> [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP15]] = call <vscale x 4 x float> @llvm.vp.merge.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[TMP14]], <vscale x 4 x float> [[VEC_PHI]], i32 [[TMP9]])
@@ -1359,8 +1330,7 @@ define float @fmin(ptr %a, i64 %n, float %start) #0 {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ [[BROADCAST_SPLAT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP7]], align 4
; NO-VP-NEXT: [[TMP9:%.*]] = fcmp fast olt <vscale x 4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
; NO-VP-NEXT: [[TMP10]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[VEC_PHI]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -1429,8 +1399,7 @@ define float @fmax(ptr %a, i64 %n, float %start) #0 {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP13:%.*]] = fcmp fast ogt <vscale x 4 x float> [[VP_OP_LOAD]], [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP14:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x float> [[VEC_PHI]]
; IF-EVL-NEXT: [[TMP15]] = call <vscale x 4 x float> @llvm.vp.merge.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[TMP14]], <vscale x 4 x float> [[VEC_PHI]], i32 [[TMP9]])
@@ -1480,8 +1449,7 @@ define float @fmax(ptr %a, i64 %n, float %start) #0 {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ [[BROADCAST_SPLAT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP7]], align 4
; NO-VP-NEXT: [[TMP9:%.*]] = fcmp fast ogt <vscale x 4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
; NO-VP-NEXT: [[TMP10]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[VEC_PHI]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -1530,38 +1498,35 @@ for.end:
define float @fminimum(ptr %a, i64 %n, float %start) {
; IF-EVL-LABEL: @fminimum(
; IF-EVL-NEXT: entry:
-; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; IF-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_RND_UP:%.*]], 16
+; IF-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
; IF-EVL: vector.ph:
-; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 7
-; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT3]], <8 x i64> poison, <8 x i32> zeroinitializer
; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <8 x float> poison, float [[START:%.*]], i64 0
; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT2]], <8 x float> poison, <8 x i32> zeroinitializer
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[BROADCAST_SPLAT]], [[ENTRY]] ], [ [[TMP4:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[IV]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
-; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
-; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT4]]
+; IF-EVL-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ [[BROADCAST_SPLAT]], [[ENTRY]] ], [ [[TMP3:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
-; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison)
+; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 8
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4
+; IF-EVL-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP1]], align 4
; IF-EVL-NEXT: [[TMP4]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_MASKED_LOAD]])
-; IF-EVL-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[TMP4]], <8 x float> [[VEC_PHI]]
-; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 8
+; IF-EVL-NEXT: [[TMP3]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI1]], <8 x float> [[WIDE_LOAD2]])
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16
; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
; IF-EVL: middle.block:
+; IF-EVL-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP3]])
; IF-EVL-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> [[TMP5]])
-; IF-EVL-NEXT: br label [[FOR_END:%.*]]
+; IF-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_RND_UP]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ]
-; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY1]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ]
; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]]
; IF-EVL: for.body:
; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
@@ -1570,7 +1535,7 @@ define float @fminimum(ptr %a, i64 %n, float %start) {
; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; IF-EVL-NEXT: [[MIN]] = tail call float @llvm.minimum.f32(float [[RDX]], float [[TMP0]])
; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N_RND_UP]]
; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP30:![0-9]+]]
; IF-EVL: for.end:
; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY1]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
@@ -1591,9 +1556,8 @@ define float @fminimum(ptr %a, i64 %n, float %start) {
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[BROADCAST_SPLAT]], [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ [[BROADCAST_SPLAT]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 8
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP1]], align 4
; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP3]], align 4
; NO-VP-NEXT: [[TMP4]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_LOAD]])
; NO-VP-NEXT: [[TMP5]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI1]], <8 x float> [[WIDE_LOAD2]])
@@ -1642,38 +1606,35 @@ for.end:
define float @fmaximum(ptr %a, i64 %n, float %start) {
; IF-EVL-LABEL: @fmaximum(
; IF-EVL-NEXT: entry:
-; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; IF-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_RND_UP:%.*]], 16
+; IF-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
; IF-EVL: vector.ph:
-; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 7
-; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT3]], <8 x i64> poison, <8 x i32> zeroinitializer
; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <8 x float> poison, float [[START:%.*]], i64 0
; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT2]], <8 x float> poison, <8 x i32> zeroinitializer
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[BROADCAST_SPLAT]], [[ENTRY]] ], [ [[TMP4:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[IV]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
-; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
-; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT4]]
+; IF-EVL-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ [[BROADCAST_SPLAT]], [[ENTRY]] ], [ [[TMP3:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]]
-; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison)
+; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 8
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4
+; IF-EVL-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP1]], align 4
; IF-EVL-NEXT: [[TMP4]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_MASKED_LOAD]])
-; IF-EVL-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[TMP4]], <8 x float> [[VEC_PHI]]
-; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 8
+; IF-EVL-NEXT: [[TMP3]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI1]], <8 x float> [[WIDE_LOAD2]])
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16
; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
; IF-EVL: middle.block:
+; IF-EVL-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP3]])
; IF-EVL-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> [[TMP5]])
-; IF-EVL-NEXT: br label [[FOR_END:%.*]]
+; IF-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_RND_UP]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ]
-; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY1]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ]
; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]]
; IF-EVL: for.body:
; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
@@ -1682,7 +1643,7 @@ define float @fmaximum(ptr %a, i64 %n, float %start) {
; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; IF-EVL-NEXT: [[MAX]] = tail call float @llvm.maximum.f32(float [[RDX]], float [[TMP0]])
; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N_RND_UP]]
; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP32:![0-9]+]]
; IF-EVL: for.end:
; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY1]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
@@ -1703,9 +1664,8 @@ define float @fmaximum(ptr %a, i64 %n, float %start) {
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[BROADCAST_SPLAT]], [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ [[BROADCAST_SPLAT]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 8
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP1]], align 4
; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP3]], align 4
; NO-VP-NEXT: [[TMP4]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_LOAD]])
; NO-VP-NEXT: [[TMP5]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI1]], <8 x float> [[WIDE_LOAD2]])
@@ -1773,11 +1733,9 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP16:%.*]] = call reassoc <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x float> [[VP_OP_LOAD1]], <vscale x 4 x float> [[VEC_PHI]])
; IF-EVL-NEXT: [[TMP17]] = call <vscale x 4 x float> @llvm.vp.merge.nxv4f32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> [[TMP16]], <vscale x 4 x float> [[VEC_PHI]], i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP10]] to i64
@@ -1826,11 +1784,9 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP9]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP11]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP10]], align 4
; NO-VP-NEXT: [[TMP12]] = call reassoc <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[WIDE_LOAD1]], <vscale x 4 x float> [[VEC_PHI]])
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; NO-VP-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1898,8 +1854,7 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP13:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 3)
; IF-EVL-NEXT: [[TMP15]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64
@@ -1948,8 +1903,7 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
; NO-VP-NEXT: [[TMP9:%.*]] = icmp slt <vscale x 4 x i32> [[WIDE_LOAD]], splat (i32 3)
; NO-VP-NEXT: [[TMP10]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP9]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -2018,8 +1972,7 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP13:%.*]] = fcmp fast olt <vscale x 4 x float> [[VP_OP_LOAD]], splat (float 3.000000e+00)
; IF-EVL-NEXT: [[TMP15]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64
@@ -2068,8 +2021,7 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP7]], align 4
; NO-VP-NEXT: [[TMP9:%.*]] = fcmp fast olt <vscale x 4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00)
; NO-VP-NEXT: [[TMP10]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP9]]
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
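
The recurring pattern in the reduction hunks above is mechanical: the wide loads now use the pointer produced by the induction-variable GEP directly, instead of going through a second GEP with a constant index of 0. In LLVM IR, a getelementptr whose only index is zero computes exactly its base pointer, so the extra instruction carries no information. A minimal standalone sketch of that equivalence — @zero_gep_demo and its names are illustrative, not taken from the tests:

; Illustrative only: a GEP whose sole index is 0 adds no offset, so
; %gep0 is the same address as %p and the load can use %p directly.
define <vscale x 4 x i32> @zero_gep_demo(ptr %p) {
entry:
  %gep0 = getelementptr inbounds i32, ptr %p, i32 0
  %v = load <vscale x 4 x i32>, ptr %gep0, align 4
  ; after folding the no-op GEP this is simply:
  ;   %v = load <vscale x 4 x i32>, ptr %p, align 4
  ret <vscale x 4 x i32> %v
}
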
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll
index 91d94e5..62eef6c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll
@@ -1,12 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=IF-EVL
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=none \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP
define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %ptr2) {
@@ -75,18 +73,61 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
;
; NO-VP-LABEL: @reverse_load_store(
; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
+; NO-VP-NEXT: [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[INDEX]]
+; NO-VP-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], -1
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP8]]
+; NO-VP-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP5]]
+; NO-VP-NEXT: [[TMP11:%.*]] = sub i64 [[TMP5]], 1
+; NO-VP-NEXT: [[TMP12:%.*]] = mul i64 -1, [[TMP11]]
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP10]]
+; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP12]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; NO-VP-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]])
+; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP8]]
+; NO-VP-NEXT: [[TMP16:%.*]] = mul i64 0, [[TMP5]]
+; NO-VP-NEXT: [[TMP17:%.*]] = sub i64 [[TMP5]], 1
+; NO-VP-NEXT: [[TMP18:%.*]] = mul i64 -1, [[TMP17]]
+; NO-VP-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP16]]
+; NO-VP-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP18]]
+; NO-VP-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[REVERSE]])
+; NO-VP-NEXT: store <vscale x 4 x i32> [[REVERSE1]], ptr [[TMP20]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[LOOPEND:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; NO-VP-NEXT: br label [[FOR_BODY1:%.*]]
; NO-VP: for.body:
-; NO-VP-NEXT: [[ADD_PHI:%.*]] = phi i64 [ [[STARTVAL:%.*]], [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; NO-VP-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY1]] ]
+; NO-VP-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY1]] ]
; NO-VP-NEXT: [[ADD]] = add i64 [[ADD_PHI]], -1
-; NO-VP-NEXT: [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[ADD]]
+; NO-VP-NEXT: [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[ADD]]
; NO-VP-NEXT: [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4
-; NO-VP-NEXT: [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[ADD]]
+; NO-VP-NEXT: [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2]], i64 [[ADD]]
; NO-VP-NEXT: store i32 [[TMP]], ptr [[GEPS]], align 4
; NO-VP-NEXT: [[INC]] = add i32 [[I]], 1
; NO-VP-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 1024
-; NO-VP-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY1]], label [[LOOPEND]], !llvm.loop [[LOOP3:![0-9]+]]
; NO-VP: loopend:
; NO-VP-NEXT: ret void
;
@@ -132,8 +173,7 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
; IF-EVL-NEXT: [[OFFSET_IDX3:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32
; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], -1
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[OFFSET_IDX3]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 100)
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP11]]
; IF-EVL-NEXT: [[TMP26:%.*]] = zext i32 [[TMP5]] to i64
@@ -189,25 +229,74 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
;
; NO-VP-LABEL: @reverse_load_store_masked(
; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[ENTRY:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
+; NO-VP-NEXT: [[TMP6:%.*]] = sub i64 [[STARTVAL1:%.*]], [[N_VEC]]
+; NO-VP-NEXT: [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL1]], [[INDEX]]
+; NO-VP-NEXT: [[OFFSET_IDX1:%.*]] = trunc i64 [[INDEX]] to i32
+; NO-VP-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], -1
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[OFFSET_IDX1]]
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP9]], align 4
+; NO-VP-NEXT: [[TMP10:%.*]] = icmp slt <vscale x 4 x i32> [[WIDE_LOAD]], splat (i32 100)
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP8]]
+; NO-VP-NEXT: [[TMP12:%.*]] = mul i64 0, [[TMP5]]
+; NO-VP-NEXT: [[TMP13:%.*]] = sub i64 [[TMP5]], 1
+; NO-VP-NEXT: [[TMP14:%.*]] = mul i64 -1, [[TMP13]]
+; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP11]], i64 [[TMP12]]
+; NO-VP-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP15]], i64 [[TMP14]]
+; NO-VP-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; NO-VP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP16]], i32 4, <vscale x 4 x i1> [[REVERSE]], <vscale x 4 x i32> poison)
+; NO-VP-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]])
+; NO-VP-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP8]]
+; NO-VP-NEXT: [[TMP18:%.*]] = mul i64 0, [[TMP5]]
+; NO-VP-NEXT: [[TMP19:%.*]] = sub i64 [[TMP5]], 1
+; NO-VP-NEXT: [[TMP20:%.*]] = mul i64 -1, [[TMP19]]
+; NO-VP-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[TMP17]], i64 [[TMP18]]
+; NO-VP-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP20]]
+; NO-VP-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; NO-VP-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[REVERSE2]])
+; NO-VP-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[REVERSE4]], ptr [[TMP22]], i32 4, <vscale x 4 x i1> [[REVERSE3]])
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[LOOPEND:%.*]], label [[ENTRY]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[STARTVAL:%.*]] = phi i64 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL1]], [[ENTRY1:%.*]] ]
+; NO-VP-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1]] ]
+; NO-VP-NEXT: br label [[FOR_BODY1:%.*]]
; NO-VP: for.body:
-; NO-VP-NEXT: [[ADD_PHI:%.*]] = phi i64 [ [[STARTVAL:%.*]], [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_INC:%.*]] ]
-; NO-VP-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[FOR_INC]] ]
+; NO-VP-NEXT: [[ADD_PHI:%.*]] = phi i64 [ [[STARTVAL]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_INC:%.*]] ]
+; NO-VP-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL5]], [[ENTRY]] ], [ [[INC:%.*]], [[FOR_INC]] ]
; NO-VP-NEXT: [[ADD]] = add i64 [[ADD_PHI]], -1
-; NO-VP-NEXT: [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[I]]
+; NO-VP-NEXT: [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i32 [[I]]
; NO-VP-NEXT: [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4
; NO-VP-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP]], 100
; NO-VP-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; NO-VP: if.then:
-; NO-VP-NEXT: [[GEPL1:%.*]] = getelementptr inbounds i32, ptr [[PTR1:%.*]], i64 [[ADD]]
+; NO-VP-NEXT: [[GEPL1:%.*]] = getelementptr inbounds i32, ptr [[PTR1]], i64 [[ADD]]
; NO-VP-NEXT: [[V:%.*]] = load i32, ptr [[GEPL1]], align 4
-; NO-VP-NEXT: [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[ADD]]
+; NO-VP-NEXT: [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2]], i64 [[ADD]]
; NO-VP-NEXT: store i32 [[V]], ptr [[GEPS]], align 4
; NO-VP-NEXT: br label [[FOR_INC]]
; NO-VP: for.inc:
; NO-VP-NEXT: [[INC]] = add i32 [[I]], 1
; NO-VP-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 1024
-; NO-VP-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY1]], label [[LOOPEND]], !llvm.loop [[LOOP5:![0-9]+]]
; NO-VP: loopend:
; NO-VP-NEXT: ret void
;
@@ -319,20 +408,49 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
;
; NO-VP-LABEL: @multiple_reverse_vector_pointer(
; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
; NO-VP-NEXT: br label [[LOOP:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
+; NO-VP-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1024, [[INDEX]]
+; NO-VP-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
+; NO-VP-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
+; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 -15
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; NO-VP-NEXT: [[REVERSE:%.*]] = shufflevector <16 x i8> [[WIDE_LOAD]], <16 x i8> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[B:%.*]], <16 x i8> [[REVERSE]]
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> [[TMP3]], i32 1, <16 x i1> splat (i1 true), <16 x i8> poison)
+; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[OFFSET_IDX]]
+; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 -15
+; NO-VP-NEXT: [[REVERSE1:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_GATHER]], <16 x i8> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; NO-VP-NEXT: store <16 x i8> [[REVERSE1]], ptr [[TMP6]], align 1
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[OFFSET_IDX]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i32 -15
+; NO-VP-NEXT: store <16 x i8> [[REVERSE1]], ptr [[TMP9]], align 1
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; NO-VP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; NO-VP-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: br label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 1024, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: br label [[LOOP1:%.*]]
; NO-VP: loop:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; NO-VP-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[IV]]
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP1]] ]
+; NO-VP-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
; NO-VP-NEXT: [[X:%.*]] = load i8, ptr [[GEP_A]], align 1
-; NO-VP-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B:%.*]], i8 [[X]]
+; NO-VP-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i8 [[X]]
; NO-VP-NEXT: [[Y:%.*]] = load i8, ptr [[GEP_B]], align 1
-; NO-VP-NEXT: [[GEP_C:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[IV]]
+; NO-VP-NEXT: [[GEP_C:%.*]] = getelementptr i8, ptr [[C]], i64 [[IV]]
; NO-VP-NEXT: store i8 [[Y]], ptr [[GEP_C]], align 1
-; NO-VP-NEXT: [[GEP_D:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[IV]]
+; NO-VP-NEXT: [[GEP_D:%.*]] = getelementptr i8, ptr [[D]], i64 [[IV]]
; NO-VP-NEXT: store i8 [[Y]], ptr [[GEP_D]], align 1
; NO-VP-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1
; NO-VP-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[IV]], 0
-; NO-VP-NEXT: br i1 [[CMP_NOT]], label [[EXIT:%.*]], label [[LOOP]]
+; NO-VP-NEXT: br i1 [[CMP_NOT]], label [[EXIT:%.*]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]]
; NO-VP: exit:
; NO-VP-NEXT: ret void
;
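
For the descending loops in this file, both check prefixes share one addressing idiom worth spelling out: a reversed access to VF consecutive elements is emitted as a single contiguous load at the lowest address of the group, followed by llvm.vector.reverse to restore iteration order (masks are handled the same way, via llvm.vector.reverse on the i1 vector). A hedged, self-contained sketch of the idiom; @reverse_read_demo and its %vf argument (the runtime element count, vscale x 4 here) are illustrative:

; Read p[i], p[i-1], ..., p[i-vf+1] with one contiguous load:
; step back to the lowest element, load forward, then reverse the lanes.
define <vscale x 4 x i32> @reverse_read_demo(ptr %p, i64 %i, i64 %vf) {
entry:
  %neg = sub i64 1, %vf                              ; -(vf - 1)
  %top = getelementptr inbounds i32, ptr %p, i64 %i  ; highest element
  %lo = getelementptr inbounds i32, ptr %top, i64 %neg
  %w = load <vscale x 4 x i32>, ptr %lo, align 4
  %r = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %w)
  ret <vscale x 4 x i32> %r
}
declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)
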
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-safe-dep-distance.ll
index 336c242..a891eef 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-safe-dep-distance.ll
@@ -1,12 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=IF-EVL
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=none \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP
; Dependence distance between read and write is greater than the trip
@@ -32,12 +30,10 @@ define void @test(ptr %p) {
; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 200, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP5]], i32 2, i1 true)
; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP10:%.*]] = add i64 [[EVL_BASED_IV]], 200
; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP10]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP11]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 8 [[TMP12]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 8 [[TMP11]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP6]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP13]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -63,17 +59,44 @@ define void @test(ptr %p) {
;
; NO-VP-LABEL: @test(
; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
; NO-VP-NEXT: br label [[LOOP:%.*]]
-; NO-VP: loop:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; NO-VP-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[IV]]
-; NO-VP-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 8
-; NO-VP-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 200
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[A1]], align 8
+; NO-VP-NEXT: [[TMP7:%.*]] = add i64 [[IV]], 200
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP7]]
+; NO-VP-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP8]], align 8
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP5]]
+; NO-VP-NEXT: [[TMP9:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 200, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; NO-VP-NEXT: br label [[LOOP1:%.*]]
+; NO-VP: loop:
+; NO-VP-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[LOOP1]] ]
+; NO-VP-NEXT: [[A3:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV1]]
+; NO-VP-NEXT: [[V:%.*]] = load i64, ptr [[A3]], align 8
+; NO-VP-NEXT: [[OFFSET:%.*]] = add i64 [[IV1]], 200
; NO-VP-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
; NO-VP-NEXT: store i64 [[V]], ptr [[A2]], align 8
-; NO-VP-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; NO-VP-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199
-; NO-VP-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; NO-VP-NEXT: [[IV_NEXT1]] = add i64 [[IV1]], 1
+; NO-VP-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV1]], 199
+; NO-VP-NEXT: br i1 [[CMP]], label [[LOOP1]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
; NO-VP: exit:
; NO-VP-NEXT: ret void
;
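A hedged note on the recurring mechanical change in these hunks: a getelementptr whose only index is zero returns its base pointer unchanged, so the vectorizer stopped emitting it and the autogenerated checks now feed the base pointer straight into the memory operation. The names below (%base, %gep0, %wide) are illustrative, not from this test:

; Removed pattern:
;   %gep0 = getelementptr i64, ptr %base, i32 0        ; identical to %base
;   %wide = load <vscale x 2 x i64>, ptr %gep0, align 8
; Updated pattern:
;   %wide = load <vscale x 2 x i64>, ptr %base, align 8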
@@ -106,19 +129,17 @@ define void @test_may_clobber1(ptr %p) {
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[INDEX]]
-; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
-; IF-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32
+; IF-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32
; IF-EVL-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 100
; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
-; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
-; IF-EVL-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
+; IF-EVL-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP4]], align 32
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; IF-EVL: middle.block:
-; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL-NEXT: br label [[EXIT:%.*]]
; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; IF-EVL-NEXT: br label [[LOOP:%.*]]
; IF-EVL: loop:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -141,19 +162,17 @@ define void @test_may_clobber1(ptr %p) {
; NO-VP: vector.body:
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32
; NO-VP-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 100
; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
-; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
-; NO-VP-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
+; NO-VP-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP4]], align 32
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; NO-VP-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; NO-VP-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; NO-VP: middle.block:
-; NO-VP-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; NO-VP-NEXT: br label [[EXIT:%.*]]
; NO-VP: scalar.ph:
-; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; NO-VP-NEXT: br label [[LOOP:%.*]]
; NO-VP: loop:
; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -164,7 +183,7 @@ define void @test_may_clobber1(ptr %p) {
; NO-VP-NEXT: store i64 [[V]], ptr [[A2]], align 32
; NO-VP-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; NO-VP-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199
-; NO-VP-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
; NO-VP: exit:
; NO-VP-NEXT: ret void
;
@@ -246,19 +265,17 @@ define void @test_may_clobber3(ptr %p) {
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[INDEX]]
-; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
-; IF-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 32
+; IF-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP1]], align 32
; IF-EVL-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 10
; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
-; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
-; IF-EVL-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
+; IF-EVL-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP4]], align 32
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; IF-EVL: middle.block:
-; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL-NEXT: br label [[EXIT:%.*]]
; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; IF-EVL-NEXT: br label [[LOOP:%.*]]
; IF-EVL: loop:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -281,19 +298,17 @@ define void @test_may_clobber3(ptr %p) {
; NO-VP: vector.body:
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 32
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP1]], align 32
; NO-VP-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 10
; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
-; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
-; NO-VP-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
+; NO-VP-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP4]], align 32
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; NO-VP-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; NO-VP-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-VP-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; NO-VP: middle.block:
-; NO-VP-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; NO-VP-NEXT: br label [[EXIT:%.*]]
; NO-VP: scalar.ph:
-; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; NO-VP-NEXT: br label [[LOOP:%.*]]
; NO-VP: loop:
; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -304,7 +319,7 @@ define void @test_may_clobber3(ptr %p) {
; NO-VP-NEXT: store i64 [[V]], ptr [[A2]], align 32
; NO-VP-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; NO-VP-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199
-; NO-VP-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; NO-VP-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
; NO-VP: exit:
; NO-VP-NEXT: ret void
;
@@ -347,12 +362,10 @@ define void @trivial_due_max_vscale(ptr %p) {
; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 200, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP5]], i32 2, i1 true)
; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP10:%.*]] = add i64 [[EVL_BASED_IV]], 8192
; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP10]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP11]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP12]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP11]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP6]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP13]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -378,17 +391,44 @@ define void @trivial_due_max_vscale(ptr %p) {
;
; NO-VP-LABEL: @trivial_due_max_vscale(
; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
; NO-VP-NEXT: br label [[LOOP:%.*]]
-; NO-VP: loop:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; NO-VP-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[IV]]
-; NO-VP-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32
-; NO-VP-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 8192
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[A1]], align 32
+; NO-VP-NEXT: [[TMP7:%.*]] = add i64 [[IV]], 8192
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP7]]
+; NO-VP-NEXT: store <vscale x 2 x i64> [[WIDE_LOAD]], ptr [[TMP8]], align 32
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP5]]
+; NO-VP-NEXT: [[TMP9:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 200, [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; NO-VP-NEXT: br label [[LOOP1:%.*]]
+; NO-VP: loop:
+; NO-VP-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[LOOP1]] ]
+; NO-VP-NEXT: [[A3:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV1]]
+; NO-VP-NEXT: [[V:%.*]] = load i64, ptr [[A3]], align 32
+; NO-VP-NEXT: [[OFFSET:%.*]] = add i64 [[IV1]], 8192
; NO-VP-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
; NO-VP-NEXT: store i64 [[V]], ptr [[A2]], align 32
-; NO-VP-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; NO-VP-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199
-; NO-VP-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; NO-VP-NEXT: [[IV_NEXT1]] = add i64 [[IV1]], 1
+; NO-VP-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV1]], 199
+; NO-VP-NEXT: br i1 [[CMP]], label [[LOOP1]], label [[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
; NO-VP: exit:
; NO-VP-NEXT: ret void
;
@@ -431,12 +471,10 @@ define void @no_high_lmul_or_interleave(ptr %p) {
; IF-EVL-NEXT: [[SAFE_AVL:%.*]] = select i1 [[TMP9]], i64 [[AVL]], i64 1024
; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[SAFE_AVL]], i32 1, i1 true)
; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0(ptr align 32 [[TMP3]], <vscale x 1 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0(ptr align 32 [[TMP2]], <vscale x 1 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP4:%.*]] = add i64 [[EVL_BASED_IV]], 1024
; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP4]]
-; IF-EVL-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv1i64.p0(<vscale x 1 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP6]], <vscale x 1 x i1> splat (i1 true), i32 [[TMP10]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv1i64.p0(<vscale x 1 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP5]], <vscale x 1 x i1> splat (i1 true), i32 [[TMP10]])
; IF-EVL-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
@@ -462,17 +500,34 @@ define void @no_high_lmul_or_interleave(ptr %p) {
;
; NO-VP-LABEL: @no_high_lmul_or_interleave(
; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; NO-VP: vector.ph:
; NO-VP-NEXT: br label [[LOOP:%.*]]
-; NO-VP: loop:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; NO-VP-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[IV]]
-; NO-VP-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32
-; NO-VP-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 1024
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[A1]], align 32
+; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[IV]], 1024
+; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP1]]
+; NO-VP-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP2]], align 32
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 4
+; NO-VP-NEXT: [[TMP3:%.*]] = icmp eq i64 [[IV_NEXT]], 3000
+; NO-VP-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: br label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; NO-VP-NEXT: br label [[LOOP1:%.*]]
+; NO-VP: loop:
+; NO-VP-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[LOOP1]] ]
+; NO-VP-NEXT: [[A3:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV1]]
+; NO-VP-NEXT: [[V:%.*]] = load i64, ptr [[A3]], align 32
+; NO-VP-NEXT: [[OFFSET:%.*]] = add i64 [[IV1]], 1024
; NO-VP-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
; NO-VP-NEXT: store i64 [[V]], ptr [[A2]], align 32
-; NO-VP-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; NO-VP-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 3001
-; NO-VP-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; NO-VP-NEXT: [[IV_NEXT1]] = add i64 [[IV1]], 1
+; NO-VP-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV1]], 3001
+; NO-VP-NEXT: br i1 [[CMP]], label [[LOOP1]], label [[EXIT:%.*]], !llvm.loop [[LOOP11:![0-9]+]]
; NO-VP: exit:
; NO-VP-NEXT: ret void
;
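A hedged aside on the RUN-line migration in this file, restating the hunk at the top of this diff (%s stands for the test file, as in lit): the -force-tail-folding-style flag is dropped and the -prefer-predicate-over-epilogue values change, which is why the NO-VP checks above now contain a full vector.body/middle.block/scalar.ph skeleton instead of a purely scalar loop.

; Old (removed) NO-VP configuration — prefers predication and refuses to
; vectorize without it, so the output stayed a scalar loop:
; RUN: opt -passes=loop-vectorize -force-tail-folding-style=none \
; RUN:   -prefer-predicate-over-epilogue=predicate-dont-vectorize \
; RUN:   -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP
; New NO-VP configuration — vectorizes with a scalar remainder loop,
; producing the vector.body plus scalar epilogue checked above:
; RUN: opt -passes=loop-vectorize \
; RUN:   -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN:   -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP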
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll
index 984b64c..8ab3817 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
-; RUN: opt < %s --prefer-predicate-over-epilogue=predicate-dont-vectorize --passes=loop-vectorize -mcpu=sifive-p470 -mattr=+v,+f -force-tail-folding-style=data-with-evl -S | FileCheck %s
+; RUN: opt < %s --prefer-predicate-over-epilogue=predicate-else-scalar-epilogue --passes=loop-vectorize -mcpu=sifive-p470 -mattr=+v,+f -S | FileCheck %s
; Generated from issue #109468.
; In this test case, the tail-masked vector store is transformed into the vp intrinsic with EVL.
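To illustrate the comment above (a hedged sketch; %v, %p, %avl, and %mask are hypothetical names, not from this test): under EVL tail folding, a tail-masked wide store is rewritten as a vp intrinsic whose explicit-vector-length operand carries the number of active lanes, and the tail mask collapses to an all-true splat:

; Tail-masked store before the transform:
;   call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> %v, ptr %p, i32 8, <vscale x 2 x i1> %mask)
; EVL form after the transform — the active length is computed per iteration:
;   %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl, i32 2, i1 true)
;   call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> %v, ptr align 8 %p, <vscale x 2 x i1> splat (i1 true), i32 %evl)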
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
index b56e712..79ec73a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
@@ -26,13 +26,11 @@ define void @test_pr98413_zext_removed(ptr %src, ptr noalias %dst, i64 %x) {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP8]], align 8
; CHECK-NEXT: [[TMP10:%.*]] = trunc <vscale x 8 x i16> [[WIDE_LOAD]] to <vscale x 8 x i8>
; CHECK-NEXT: [[TMP11:%.*]] = and <vscale x 8 x i8> [[TMP6]], [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP7]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0
-; CHECK-NEXT: store <vscale x 8 x i8> [[TMP11]], ptr [[TMP13]], align 1
+; CHECK-NEXT: store <vscale x 8 x i8> [[TMP11]], ptr [[TMP12]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP7]], [[TMP5]]
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -99,13 +97,11 @@ define void @test_pr98413_sext_removed(ptr %src, ptr noalias %dst, i64 %x) {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP8]], align 8
; CHECK-NEXT: [[TMP10:%.*]] = trunc <vscale x 8 x i16> [[WIDE_LOAD]] to <vscale x 8 x i8>
; CHECK-NEXT: [[TMP11:%.*]] = and <vscale x 8 x i8> [[TMP6]], [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP7]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0
-; CHECK-NEXT: store <vscale x 8 x i8> [[TMP11]], ptr [[TMP13]], align 1
+; CHECK-NEXT: store <vscale x 8 x i8> [[TMP11]], ptr [[TMP12]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP7]], [[TMP5]]
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll
index 72afff2..4da31a0 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s
+; RUN: opt -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s
; Make sure we don't crash when transforming a VPWidenCastRecipe created without
; an underlying value to an EVL recipe. This occurs in this test via
@@ -25,14 +25,7 @@ define void @truncate_to_minimal_bitwidths_widen_cast_recipe(ptr %src) {
; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = sub i64 9, [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true)
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[EVL_BASED_IV]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr align 1 [[TMP6]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
-; CHECK-NEXT: [[TMP8:%.*]] = zext <vscale x 8 x i8> [[VP_OP_LOAD]] to <vscale x 8 x i16>
-; CHECK-NEXT: [[TMP12:%.*]] = mul <vscale x 8 x i16> zeroinitializer, [[TMP8]]
-; CHECK-NEXT: [[TMP13:%.*]] = lshr <vscale x 8 x i16> [[TMP12]], splat (i16 1)
-; CHECK-NEXT: [[TMP14:%.*]] = trunc <vscale x 8 x i16> [[TMP13]] to <vscale x 8 x i8>
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP14]], <vscale x 8 x ptr> align 1 zeroinitializer, <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> zeroinitializer, <vscale x 8 x ptr> align 1 zeroinitializer, <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]])
; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP9]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
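A hedged reading of why the updated checks in this hunk collapse to scattering zeroinitializer: the deleted chain multiplies the zero-extended load by zeroinitializer, so every subsequent step constant-folds to zero and only the scatter of zeroinitializer survives:

; %z = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
; %m = mul <vscale x 8 x i16> zeroinitializer, %z         ; folds to zeroinitializer
; %s = lshr <vscale x 8 x i16> %m, splat (i16 1)          ; still zeroinitializer
; %t = trunc <vscale x 8 x i16> %s to <vscale x 8 x i8>   ; still zeroinitializer
; hence the single remaining instruction:
;   call void @llvm.vp.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> zeroinitializer, ...)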
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll b/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll
index 1e1ed49..bb2e099 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s
+; RUN: opt -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s
; This test tries to recreate the conditions for a crash that occurred when the
; VPTypeAnalysis cache wasn't cleared after a recipe was erased and clobbered
@@ -37,18 +37,15 @@ define void @type_info_cache_clobber(ptr %dstv, ptr %src, i64 %wide.trip.count)
; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[TMP0]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true)
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[EVL_BASED_IV]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
-; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr align 1 [[TMP14]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP11]]), !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr align 1 [[TMP13]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP11]]), !alias.scope [[META0:![0-9]+]]
; CHECK-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[VP_OP_LOAD]] to <vscale x 8 x i32>
-; CHECK-NEXT: [[VP_OP:%.*]] = mul <vscale x 8 x i32> [[TMP15]], zeroinitializer
; CHECK-NEXT: [[TMP23:%.*]] = ashr <vscale x 8 x i32> [[TMP15]], zeroinitializer
; CHECK-NEXT: [[VP_OP3:%.*]] = or <vscale x 8 x i32> [[TMP23]], zeroinitializer
; CHECK-NEXT: [[TMP16:%.*]] = icmp ult <vscale x 8 x i32> [[TMP15]], zeroinitializer
; CHECK-NEXT: [[TMP17:%.*]] = select <vscale x 8 x i1> [[TMP16]], <vscale x 8 x i32> [[VP_OP3]], <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP24:%.*]] = trunc <vscale x 8 x i32> [[TMP17]] to <vscale x 8 x i8>
; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP24]], <vscale x 8 x ptr> align 1 [[BROADCAST_SPLAT]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP11]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
-; CHECK-NEXT: [[TMP19:%.*]] = trunc <vscale x 8 x i32> [[VP_OP]] to <vscale x 8 x i16>
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> [[TMP19]], <vscale x 8 x ptr> align 2 zeroinitializer, <vscale x 8 x i1> splat (i1 true), i32 [[TMP11]])
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 zeroinitializer, <vscale x 8 x i1> splat (i1 true), i32 [[TMP11]])
; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index c6955f1..d3c2ccf 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=on -riscv-v-vector-bits-min=0 -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=SCALABLE
; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=off -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=FIXEDLEN
-; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=on -riscv-v-vector-bits-min=0 -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=TF-SCALABLE
+; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=on -riscv-v-vector-bits-min=0 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=TF-SCALABLE
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
target triple = "riscv64"
@@ -28,8 +28,7 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6
; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP7]], i64 0
; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; SCALABLE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP9]], align 8
+; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP8]], align 8
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SCALABLE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -62,15 +61,14 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6
; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0
; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
-; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8
+; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8
; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXEDLEN-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXEDLEN-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; FIXEDLEN: [[MIDDLE_BLOCK]]:
-; FIXEDLEN-NEXT: br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; FIXEDLEN-NEXT: br label %[[SCALAR_PH]]
; FIXEDLEN: [[SCALAR_PH]]:
; FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]]
@@ -81,7 +79,7 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6
; FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; FIXEDLEN: [[FOR_END]]:
; FIXEDLEN-NEXT: ret void
;
@@ -108,10 +106,9 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6
; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP5]], i64 0
; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; TF-SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]])
-; TF-SCALABLE-NEXT: [[TMP10:%.*]] = zext i32 [[TMP6]] to i64
-; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]]
+; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]])
+; TF-SCALABLE-NEXT: [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]]
; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]]
; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -169,8 +166,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP6]], i64 0
; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; SCALABLE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP9]], align 8
+; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP8]], align 8
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SCALABLE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -204,15 +200,14 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0
; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
-; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8
+; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8
; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXEDLEN-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXEDLEN-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; FIXEDLEN: [[MIDDLE_BLOCK]]:
-; FIXEDLEN-NEXT: br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; FIXEDLEN-NEXT: br label %[[SCALAR_PH]]
; FIXEDLEN: [[SCALAR_PH]]:
; FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]]
@@ -223,25 +218,52 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
; FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; FIXEDLEN: [[FOR_END]]:
-; FIXEDLEN-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], %[[FOR_BODY]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ]
+; FIXEDLEN-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], %[[FOR_BODY]] ]
; FIXEDLEN-NEXT: ret i64 [[V_LCSSA]]
;
; TF-SCALABLE-LABEL: define i64 @uniform_load_outside_use(
; TF-SCALABLE-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; TF-SCALABLE-NEXT: [[ENTRY:.*]]:
-; TF-SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
-; TF-SCALABLE: [[FOR_BODY]]:
-; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
+; TF-SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
+; TF-SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; TF-SCALABLE: [[VECTOR_PH]]:
+; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2
+; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
+; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
+; TF-SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 2
+; TF-SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]]
+; TF-SCALABLE: [[VECTOR_BODY]]:
+; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; TF-SCALABLE-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8
+; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
+; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; TF-SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; TF-SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[ARRAYIDX]], align 8
+; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], [[TMP5]]
+; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; TF-SCALABLE: [[MIDDLE_BLOCK]]:
+; TF-SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
+; TF-SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; TF-SCALABLE: [[SCALAR_PH]]:
+; TF-SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; TF-SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
+; TF-SCALABLE: [[FOR_BODY]]:
+; TF-SCALABLE-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[V1:%.*]] = load i64, ptr [[B]], align 8
+; TF-SCALABLE-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]]
+; TF-SCALABLE-NEXT: store i64 [[V1]], ptr [[ARRAYIDX1]], align 8
+; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY]]
+; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; TF-SCALABLE: [[FOR_END]]:
-; TF-SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], %[[FOR_BODY]] ]
+; TF-SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V1]], %[[FOR_BODY]] ], [ [[V]], %[[MIDDLE_BLOCK]] ]
; TF-SCALABLE-NEXT: ret i64 [[V_LCSSA]]
;
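A hedged note on the live-out handling visible in this function: %V is used outside the loop, so it needs an LCSSA phi in for.end with one incoming value per predecessor. In the FIXEDLEN checks the middle block now branches unconditionally to scalar.ph, so the phi keeps only the scalar-loop incoming; in the TF-SCALABLE checks the vector loop can now reach for.end directly, so the phi gains a middle.block incoming:

; FIXEDLEN — middle.block is no longer a predecessor of for.end:
;   %v.lcssa = phi i64 [ %v, %for.body ]
; TF-SCALABLE — the vector loop may exit straight to for.end:
;   %v.lcssa = phi i64 [ %v1, %for.body ], [ %v, %middle.block ]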
entry:
@@ -292,8 +314,7 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
; SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i64> poison)
; SCALABLE-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i64> [[WIDE_MASKED_GATHER]], <vscale x 4 x i64> zeroinitializer
; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
-; SCALABLE-NEXT: store <vscale x 4 x i64> [[PREDPHI]], ptr [[TMP13]], align 8
+; SCALABLE-NEXT: store <vscale x 4 x i64> [[PREDPHI]], ptr [[TMP12]], align 8
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -340,16 +361,15 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
; FIXEDLEN-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> [[WIDE_MASKED_GATHER]], <4 x i64> zeroinitializer
; FIXEDLEN-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> [[WIDE_MASKED_GATHER1]], <4 x i64> zeroinitializer
; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 4
-; FIXEDLEN-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP4]], align 8
+; FIXEDLEN-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP3]], align 8
; FIXEDLEN-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP5]], align 8
; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
; FIXEDLEN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXEDLEN-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; FIXEDLEN: [[MIDDLE_BLOCK]]:
-; FIXEDLEN-NEXT: br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; FIXEDLEN-NEXT: br label %[[SCALAR_PH]]
; FIXEDLEN: [[SCALAR_PH]]:
; FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]]
@@ -366,7 +386,7 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
; FIXEDLEN-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8
; FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; FIXEDLEN: [[FOR_END]]:
; FIXEDLEN-NEXT: ret void
;
@@ -405,14 +425,13 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
; TF-SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i64> @llvm.vp.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> align 8 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[TMP10]], i32 [[TMP7]])
; TF-SCALABLE-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i64> [[WIDE_MASKED_GATHER]], <vscale x 4 x i64> zeroinitializer
; TF-SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; TF-SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
-; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv4i64.p0(<vscale x 4 x i64> [[PREDPHI]], ptr align 8 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv4i64.p0(<vscale x 4 x i64> [[PREDPHI]], ptr align 8 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
; TF-SCALABLE-NEXT: [[TMP15:%.*]] = zext i32 [[TMP7]] to i64
; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[INDEX]]
; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]]
; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; TF-SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[SCALAR_PH]]:
@@ -431,7 +450,7 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
; TF-SCALABLE-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8
; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; TF-SCALABLE: [[FOR_END]]:
; TF-SCALABLE-NEXT: ret void
;
@@ -480,8 +499,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP6]], i64 0
; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; SCALABLE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP9]], align 8
+; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP8]], align 8
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SCALABLE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -514,15 +532,14 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0
; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
-; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8
+; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8
; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXEDLEN-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXEDLEN-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; FIXEDLEN: [[MIDDLE_BLOCK]]:
-; FIXEDLEN-NEXT: br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; FIXEDLEN-NEXT: br label %[[SCALAR_PH]]
; FIXEDLEN: [[SCALAR_PH]]:
; FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]]
@@ -533,7 +550,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
; FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; FIXEDLEN: [[FOR_END]]:
; FIXEDLEN-NEXT: ret void
;
@@ -560,13 +577,12 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP5]], i64 0
; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; TF-SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]])
-; TF-SCALABLE-NEXT: [[TMP10:%.*]] = zext i32 [[TMP6]] to i64
-; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]]
+; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]])
+; TF-SCALABLE-NEXT: [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]]
; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]]
; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[SCALAR_PH]]:
@@ -579,7 +595,7 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; TF-SCALABLE: [[FOR_END]]:
; TF-SCALABLE-NEXT: ret void
;
@@ -621,8 +637,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; SCALABLE-NEXT: store i64 [[V]], ptr [[B]], align 8
; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP8]], align 8
+; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -655,15 +670,14 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; FIXEDLEN-NEXT: store i64 [[V]], ptr [[B]], align 8
; FIXEDLEN-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4
-; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8
+; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 8
; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8
; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXEDLEN-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXEDLEN-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; FIXEDLEN: [[MIDDLE_BLOCK]]:
-; FIXEDLEN-NEXT: br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; FIXEDLEN-NEXT: br label %[[SCALAR_PH]]
; FIXEDLEN: [[SCALAR_PH]]:
; FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]]
@@ -674,7 +688,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; FIXEDLEN: [[FOR_END]]:
; FIXEDLEN-NEXT: ret void
;
@@ -701,13 +715,12 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[B]], align 8
; TF-SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
-; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
-; TF-SCALABLE-NEXT: [[TMP9:%.*]] = zext i32 [[TMP5]] to i64
-; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP9]], [[INDEX]]
+; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP6]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
+; TF-SCALABLE-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64
+; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[INDEX]]
; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]]
; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[SCALAR_PH]]:
@@ -720,7 +733,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; TF-SCALABLE: [[FOR_END]]:
; TF-SCALABLE-NEXT: ret void
;
@@ -774,8 +787,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; SCALABLE-NEXT: [[TMP15:%.*]] = extractelement <vscale x 2 x i64> [[TMP9]], i32 [[TMP14]]
; SCALABLE-NEXT: store i64 [[TMP15]], ptr [[B]], align 8
; SCALABLE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]]
-; SCALABLE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[TMP16]], i32 0
-; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP17]], align 8
+; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP16]], align 8
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; SCALABLE-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SCALABLE-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -812,15 +824,14 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; FIXEDLEN-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 7
; FIXEDLEN-NEXT: store i64 [[TMP4]], ptr [[B]], align 8
; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
; FIXEDLEN-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 4
-; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
+; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8
; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8
; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXEDLEN-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXEDLEN-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; FIXEDLEN: [[MIDDLE_BLOCK]]:
-; FIXEDLEN-NEXT: br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; FIXEDLEN-NEXT: br label %[[SCALAR_PH]]
; FIXEDLEN: [[SCALAR_PH]]:
; FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]]
@@ -831,7 +842,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; FIXEDLEN: [[FOR_END]]:
; FIXEDLEN-NEXT: ret void
;
@@ -868,14 +879,13 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; TF-SCALABLE-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
; TF-SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; TF-SCALABLE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT3]], ptr align 8 [[TMP11]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
-; TF-SCALABLE-NEXT: [[TMP14:%.*]] = zext i32 [[TMP9]] to i64
-; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP14]], [[INDEX]]
+; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT3]], ptr align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
+; TF-SCALABLE-NEXT: [[TMP11:%.*]] = zext i32 [[TMP9]] to i64
+; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]]
; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[SCALAR_PH]]:
@@ -888,7 +898,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; TF-SCALABLE: [[FOR_END]]:
; TF-SCALABLE-NEXT: ret void
;
@@ -940,8 +950,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
; SCALABLE-NEXT: [[TMP10:%.*]] = icmp ugt <vscale x 2 x i64> [[VEC_IND]], splat (i64 10)
; SCALABLE-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT1]], <vscale x 2 x ptr> [[BROADCAST_SPLAT2]], i32 8, <vscale x 2 x i1> [[TMP10]])
; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
-; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT1]], ptr [[TMP13]], align 8
+; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT1]], ptr [[TMP12]], align 8
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -987,16 +996,15 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
; FIXEDLEN-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[BROADCAST_SPLAT]], <4 x ptr> [[BROADCAST_SPLAT2]], i32 8, <4 x i1> [[TMP1]])
; FIXEDLEN-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[BROADCAST_SPLAT]], <4 x ptr> [[BROADCAST_SPLAT2]], i32 8, <4 x i1> [[TMP2]])
; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 4
-; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
+; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8
; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8
; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
; FIXEDLEN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXEDLEN-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; FIXEDLEN: [[MIDDLE_BLOCK]]:
-; FIXEDLEN-NEXT: br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; FIXEDLEN-NEXT: br label %[[SCALAR_PH]]
; FIXEDLEN: [[SCALAR_PH]]:
; FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]]
@@ -1012,7 +1020,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
; FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; FIXEDLEN: [[FOR_END]]:
; FIXEDLEN-NEXT: ret void
;
@@ -1050,14 +1058,13 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
; TF-SCALABLE-NEXT: [[TMP10:%.*]] = icmp ugt <vscale x 2 x i64> [[VEC_IND]], splat (i64 10)
; TF-SCALABLE-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT1]], <vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT2]], <vscale x 2 x i1> [[TMP10]], i32 [[TMP9]])
; TF-SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; TF-SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
-; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT1]], ptr align 8 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
-; TF-SCALABLE-NEXT: [[TMP15:%.*]] = zext i32 [[TMP9]] to i64
-; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[INDEX]]
+; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT1]], ptr align 8 [[TMP12]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP9]])
+; TF-SCALABLE-NEXT: [[TMP14:%.*]] = zext i32 [[TMP9]] to i64
+; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP14]], [[INDEX]]
; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]]
; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; TF-SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; TF-SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; TF-SCALABLE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[SCALAR_PH]]:
@@ -1075,7 +1082,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; TF-SCALABLE: [[FOR_END]]:
; TF-SCALABLE-NEXT: ret void
;
@@ -1123,8 +1130,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; SCALABLE-NEXT: store i64 [[V]], ptr [[B]], align 1
; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP8]], align 8
+; SCALABLE-NEXT: store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8
; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
@@ -1157,15 +1163,14 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; FIXEDLEN-NEXT: store i64 [[V]], ptr [[B]], align 1
; FIXEDLEN-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4
-; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8
+; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 8
; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8
; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FIXEDLEN-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXEDLEN-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; FIXEDLEN: [[MIDDLE_BLOCK]]:
-; FIXEDLEN-NEXT: br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; FIXEDLEN-NEXT: br label %[[SCALAR_PH]]
; FIXEDLEN: [[SCALAR_PH]]:
; FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]]
@@ -1176,7 +1181,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
; FIXEDLEN-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; FIXEDLEN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; FIXEDLEN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; FIXEDLEN-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; FIXEDLEN: [[FOR_END]]:
; FIXEDLEN-NEXT: ret void
;
@@ -1203,13 +1208,12 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[B]], align 1
; TF-SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
-; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
-; TF-SCALABLE-NEXT: [[TMP9:%.*]] = zext i32 [[TMP5]] to i64
-; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP9]], [[INDEX]]
+; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr align 8 [[TMP6]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
+; TF-SCALABLE-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64
+; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[INDEX]]
; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP4]]
; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
; TF-SCALABLE: [[SCALAR_PH]]:
@@ -1222,7 +1226,7 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; TF-SCALABLE: [[FOR_END]]:
; TF-SCALABLE-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
index 7b91f10..7537a43 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
@@ -1,12 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=IF-EVL %s
; RUN: opt -passes=loop-vectorize \
-; RUN: -force-tail-folding-style=none \
-; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=NO-VP %s
define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
@@ -29,15 +27,12 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-NEXT: [[VP_OP:%.*]] = add nsw <vscale x 4 x i32> [[VP_OP_LOAD1]], [[VP_OP_LOAD]]
; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP]], ptr align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP]], ptr align 4 [[TMP18]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP12]])
; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP12]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
@@ -80,15 +75,12 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; NO-VP: vector.body:
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP10]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP9]], align 4
; NO-VP-NEXT: [[TMP11:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP11]], ptr [[TMP13]], align 4
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP11]], ptr [[TMP12]], align 4
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NO-VP-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll
index e7fdfbc..bda9839 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll
@@ -31,9 +31,9 @@ define void @vf_will_not_generate_any_vector_insts(ptr %src, ptr %dst) {
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[TMP3:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll
index 9652351..7f1066c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll
@@ -1,8 +1,7 @@
; REQUIRES: asserts
; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
define void @vp_smax(ptr %a, ptr %b, ptr %c, i64 %N) {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll
index 3ec48ef..c1b656a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cast-intrinsics.ll
@@ -1,7 +1,6 @@
; REQUIRES: asserts
; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
define void @vp_sext(ptr %a, ptr %b, i64 %N) {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll
index 7f29213..9900602 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll
@@ -1,8 +1,7 @@
; REQUIRES: asserts
; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index baf546b..1c9554d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -1,25 +1,21 @@
; REQUIRES: asserts
; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=IF-EVL-OUTLOOP,IF-EVL %s
; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
; RUN: -prefer-inloop-reductions \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=IF-EVL-INLOOP,IF-EVL %s
; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
-; RUN: -force-tail-folding-style=none \
-; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP-OUTLOOP %s
; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
; RUN: -prefer-inloop-reductions \
-; RUN: -force-tail-folding-style=none \
-; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP-INLOOP %s
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
index 97a6130..42a846a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
@@ -1,13 +1,11 @@
; REQUIRES: asserts
; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
-; RUN: -force-tail-folding-style=data-with-evl \
-; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=IF-EVL,CHECK %s
; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
-; RUN: -force-tail-folding-style=none \
-; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP,CHECK %s
define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll
index c5e4ecc..722786f 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll
@@ -25,7 +25,7 @@ define i32 @foo(ptr nocapture %A) {
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
@@ -76,7 +76,7 @@ define i32 @foo1(ptr nocapture noalias %A, ptr nocapture %PtrPtr) {
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
index 61bcbaa..a91bc65 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
@@ -34,8 +34,7 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i8> [[TMP14]], i8 [[TMP11]], i32 3
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr [8 x i32], ptr @src, i64 0, i64 4
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP23]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0
; CHECK-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; CHECK: [[PRED_STORE_IF]]:
@@ -66,9 +65,9 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
; CHECK: [[PRED_STORE_CONTINUE6]]:
; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
index 86b28c3..c61b1b9 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
@@ -19,13 +19,12 @@ define void @f1() {
; CHECK: vector.body:
; CHECK-NEXT: [[TMP0:%.*]] = sext i16 0 to i64
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr ptr, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <2 x ptr> <ptr @a, ptr @a>, ptr [[TMP2]], align 8
+; CHECK-NEXT: store <2 x ptr> <ptr @a, ptr @a>, ptr [[TMP1]], align 8
; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[BB3:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 2, [[MIDDLE_BLOCK]] ], [ 0, [[BB1:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[BB1:%.*]] ]
; CHECK-NEXT: br label [[BB2:%.*]]
; CHECK: bb2:
; CHECK-NEXT: [[C_1_0:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[_TMP9:%.*]], [[BB2]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
index 0cbf6eb..9506ad3 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
@@ -26,8 +26,7 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <32 x i8> [ <i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <32 x i8> [[VEC_IND]], ptr [[TMP6]], align 1
+; CHECK-NEXT: store <32 x i8> [[VEC_IND]], ptr [[TMP5]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <32 x i8> [[VEC_IND]], splat (i8 32)
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -56,8 +55,7 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
; CHECK-NEXT: [[VEC_IND8:%.*]] = phi <4 x i8> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX10:%.*]] = add i64 3, [[INDEX7]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_IDX10]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
-; CHECK-NEXT: store <4 x i8> [[VEC_IND8]], ptr [[TMP11]], align 1
+; CHECK-NEXT: store <4 x i8> [[VEC_IND8]], ptr [[TMP10]], align 1
; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX7]], 4
; CHECK-NEXT: [[VEC_IND_NEXT9]] = add <4 x i8> [[VEC_IND8]], splat (i8 4)
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC3]]
@@ -129,11 +127,10 @@ define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin
; CHECK-NEXT: [[TMP19:%.*]] = sitofp <2 x i64> [[TMP10]] to <2 x float>
; CHECK-NEXT: [[TMP20:%.*]] = sitofp <2 x i64> [[TMP11]] to <2 x float>
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 2
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 4
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 6
-; CHECK-NEXT: store <2 x float> [[TMP12]], ptr [[TMP14]], align 4
+; CHECK-NEXT: store <2 x float> [[TMP12]], ptr [[TMP13]], align 4
; CHECK-NEXT: store <2 x float> [[TMP18]], ptr [[TMP15]], align 4
; CHECK-NEXT: store <2 x float> [[TMP19]], ptr [[TMP16]], align 4
; CHECK-NEXT: store <2 x float> [[TMP20]], ptr [[TMP17]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-constant-known-via-scev.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-constant-known-via-scev.ll
index 0c5db43..04e0daf 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-constant-known-via-scev.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-constant-known-via-scev.ll
@@ -25,7 +25,7 @@ define i64 @test_foldable_live_in_via_scev() {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[BIN_RDX:%.*]] = mul <2 x i64> [[TMP1]], [[TMP0]]
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> [[BIN_RDX]])
-; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 97, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
@@ -36,9 +36,9 @@ define i64 @test_foldable_live_in_via_scev() {
; CHECK-NEXT: [[MUL]] = mul nsw i64 [[RED]], [[ADD]]
; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 100
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RET:%.*]] = phi i64 [ [[MUL]], %[[LOOP]] ], [ [[TMP3]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RET:%.*]] = phi i64 [ [[MUL]], %[[LOOP]] ]
; CHECK-NEXT: ret i64 [[RET]]
;
entry:
@@ -95,10 +95,10 @@ define i64 @second_lshr_operand_zero_via_scev() {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[BIN_RDX:%.*]] = or <2 x i64> [[TMP11]], [[TMP10]]
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[BIN_RDX]])
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOPS:.*]]
; CHECK: [[LOOPS]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOPS]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 147e949..85b475c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -982,7 +982,7 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 {
; CHECK-NEXT: [[BIN_RDX:%.*]] = and <4 x i32> [[TMP12]], [[TMP11]]
; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[BIN_RDX]])
; CHECK-NEXT: store i32 [[TMP10]], ptr [[DST:%.*]], align 4
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
@@ -1001,7 +1001,7 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 {
; CHECK-NEXT: store i32 [[RED_NEXT]], ptr [[DST]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 29
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -1045,8 +1045,8 @@ define i64 @live_in_known_1_via_scev() {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ <i64 3, i64 1, i64 1, i64 1>, [[VECTOR_PH]] ], [ [[VEC_PHI]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8
-; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8
+; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> [[VEC_PHI]])
; CHECK-NEXT: br label [[EXIT:%.*]]
@@ -1110,10 +1110,10 @@ define i64 @cost_loop_invariant_recipes(i1 %x, i64 %y) {
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> [[TMP3]])
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 1, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT_I_I_I:%.*]], [[LOOP]] ]
@@ -1169,10 +1169,10 @@ define i32 @narrowed_reduction(ptr %a, i1 %cmp) #0 {
; CHECK: middle.block:
; CHECK-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP5]])
; CHECK-NEXT: [[TMP21:%.*]] = zext i1 [[TMP20]] to i32
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[VEC_EPILOG_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 17, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP1:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INC:%.*]], [[LOOP1]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll
index fd53a4c..d49aca9 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll
@@ -37,8 +37,7 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
; CHECK-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], [[TMP22]]
; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP26]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr [[TMP27]], i32 8, <4 x i1> [[TMP6]])
+; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr [[TMP26]], i32 8, <4 x i1> [[TMP6]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -151,8 +150,7 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i
; CHECK-NEXT: [[TMP26:%.*]] = add i32 [[TMP25]], [[TMP24]]
; CHECK-NEXT: [[TMP27:%.*]] = sext i32 [[TMP26]] to i64
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP27]]
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[TMP28]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr [[TMP29]], i32 8, <4 x i1> [[TMP8]])
+; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr [[TMP28]], i32 8, <4 x i1> [[TMP8]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[TMP5]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
index 49d33d3..84e36cb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
@@ -33,12 +33,10 @@ define void @drop_scalar_nuw_nsw(ptr noalias nocapture readonly %input, ptr %out
; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr [[INPUT]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP4]], i32 4, <4 x i1> [[TMP1]], <4 x float> poison), !invariant.load [[META0:![0-9]+]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[TMP1]], <4 x float> poison), !invariant.load [[META0:![0-9]+]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x float> zeroinitializer, <4 x float> [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4
+; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
@@ -85,12 +83,10 @@ define void @drop_scalar_gep_nusw(ptr noalias nocapture readonly %input, ptr %ou
; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr [[INPUT]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP4]], i32 4, <4 x i1> [[TMP1]], <4 x float> poison), !invariant.load [[META0]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[TMP1]], <4 x float> poison), !invariant.load [[META0]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x float> zeroinitializer, <4 x float> [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr nusw float, ptr [[OUTPUT]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr nusw float, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4
+; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -137,12 +133,10 @@ define void @drop_scalar_gep_nuw(ptr noalias nocapture readonly %input, ptr %out
; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr [[INPUT]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP4]], i32 4, <4 x i1> [[TMP1]], <4 x float> poison), !invariant.load [[META0]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[TMP1]], <4 x float> poison), !invariant.load [[META0]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x float> zeroinitializer, <4 x float> [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr nuw float, ptr [[OUTPUT]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr nuw float, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4
+; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
@@ -190,12 +184,10 @@ define void @drop_nonpred_scalar_nuw_nsw(ptr noalias nocapture readonly %input,
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, ptr [[INPUT]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i1> [[TMP2]], splat (i1 true)
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP4]], i32 4, <4 x i1> [[TMP3]], <4 x float> poison), !invariant.load [[META0]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[TMP3]], <4 x float> poison), !invariant.load [[META0]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x float> zeroinitializer, <4 x float> [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4
+; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
@@ -246,8 +238,7 @@ define void @preserve_vector_nuw_nsw(ptr noalias nocapture readonly %input, ptr
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP4]], i32 4, <4 x i1> [[TMP1]], <4 x float> poison), !invariant.load [[META0]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x float> zeroinitializer, <4 x float> [[WIDE_MASKED_GATHER]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4
+; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
@@ -295,16 +286,13 @@ define void @drop_vector_nuw_nsw(ptr noalias nocapture readonly %input, ptr %out
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds ptr, ptr [[PTRS]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i64> [[VEC_IND]], splat (i64 1)
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr [[INPUT]], <4 x i64> [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <4 x ptr> [[TMP3]], ptr [[TMP4]], align 8
+; CHECK-NEXT: store <4 x ptr> [[TMP3]], ptr [[TMP1]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr float, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP7]], i32 4, <4 x i1> [[TMP5]], <4 x float> poison), !invariant.load [[META0]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP6]], i32 4, <4 x i1> [[TMP5]], <4 x float> poison), !invariant.load [[META0]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x float> zeroinitializer, <4 x float> [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP9]], align 4
+; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
@@ -369,15 +357,12 @@ define void @drop_nonvector_nuw_nsw_avx1(ptr noalias nocapture readonly %input,
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x ptr> [[TMP14]], ptr [[TMP11]], i32 1
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x ptr> [[TMP15]], ptr [[TMP12]], i32 2
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x ptr> [[TMP16]], ptr [[TMP13]], i32 3
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <4 x ptr> [[TMP17]], ptr [[TMP18]], align 8
+; CHECK-NEXT: store <4 x ptr> [[TMP17]], ptr [[TMP5]], align 8
; CHECK-NEXT: [[TMP19:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true)
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr float, ptr [[TMP10]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP20]], i32 4, <4 x i1> [[TMP19]], <4 x float> poison), !invariant.load [[META0]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP10]], i32 4, <4 x i1> [[TMP19]], <4 x float> poison), !invariant.load [[META0]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x float> zeroinitializer, <4 x float> [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i32 0
-; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP22]], align 4
+; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP21]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
@@ -427,8 +412,7 @@ define void @preserve_nuw_nsw_no_addr(ptr %output) local_unnamed_addr #0 {
; CHECK-NEXT: [[TMP1:%.*]] = sub nuw nsw <4 x i64> [[VEC_IND]], splat (i64 1)
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> zeroinitializer, <4 x i64> [[TMP1]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[OUTPUT]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; CHECK-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP3]], align 4
+; CHECK-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP2]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
@@ -476,12 +460,10 @@ define void @drop_scalar_exact(ptr noalias nocapture readonly %input, ptr %outpu
; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true)
; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[INPUT]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr float, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP7]], i32 4, <4 x i1> [[TMP4]], <4 x float> poison), !invariant.load [[META0]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP6]], i32 4, <4 x i1> [[TMP4]], <4 x float> poison), !invariant.load [[META0]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP3]], <4 x float> zeroinitializer, <4 x float> [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP9]], align 4
+; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
@@ -530,8 +512,7 @@ define void @drop_zext_nneg(ptr noalias %p, ptr noalias %p1) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i32> [[VEC_IND]] to <4 x i64>
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[P]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP4]], i32 8, <4 x i1> [[TMP0]], <4 x double> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP3]], i32 8, <4 x i1> [[TMP0]], <4 x double> poison)
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[PREDPHI]], i32 3
; CHECK-NEXT: store double [[TMP5]], ptr [[P1]], align 8
@@ -589,8 +570,7 @@ define void @preserve_vector_exact_no_addr(ptr noalias nocapture readonly %input
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP6]], i32 4, <4 x i1> [[TMP4]], <4 x float> poison), !invariant.load [[META0]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP3]], <4 x float> zeroinitializer, <4 x float> [[WIDE_MASKED_GATHER]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP8]], align 4
+; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP7]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
@@ -641,8 +621,7 @@ define void @preserve_exact_no_addr(ptr %output) local_unnamed_addr #0 {
; CHECK-NEXT: [[TMP1:%.*]] = sdiv exact <4 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> zeroinitializer, <4 x i64> [[TMP1]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[OUTPUT]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; CHECK-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP3]], align 4
+; CHECK-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP2]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
@@ -789,8 +768,7 @@ define void @pr70590_recipe_without_underlying_instr(i64 %n, ptr noalias %dst) {
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = phi <4 x i8> [ [[TMP22]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP28]], %[[PRED_LOAD_IF5]] ]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i8> zeroinitializer, <4 x i8> [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0
-; CHECK-NEXT: store <4 x i8> [[PREDPHI]], ptr [[TMP16]], align 4
+; CHECK-NEXT: store <4 x i8> [[PREDPHI]], ptr [[TMP15]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
@@ -889,8 +867,7 @@ define void @recipe_without_underlying_instr_lanes_used(i64 %n, ptr noalias %dst
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[PREDPHI7]], i32 3
; CHECK-NEXT: store i64 [[TMP12]], ptr [[AUX]], align 8
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
-; CHECK-NEXT: store <4 x i8> [[PREDPHI]], ptr [[TMP11]], align 4
+; CHECK-NEXT: store <4 x i8> [[PREDPHI]], ptr [[TMP10]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
@@ -940,16 +917,13 @@ define void @Bgep_inbounds_unconditionally_due_to_store(ptr noalias %B, ptr read
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 20)
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[WIDE_LOAD1]], splat (float 2.000000e+00)
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x float> splat (float 3.300000e+01), <4 x float> [[TMP5]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0
-; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4
+; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP3]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
index 523f284..7c74981 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
@@ -44,8 +44,7 @@ define void @test_pr59459(i64 %iv.start, ptr %arr) {
; CHECK-NEXT: [[TMP15:%.*]] = trunc <16 x i32> [[TMP14]] to <16 x i16>
; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP12]] to i64
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i16, ptr [[ARR:%.*]], i64 [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[TMP17]], i32 0
-; CHECK-NEXT: store <16 x i16> [[TMP15]], ptr [[TMP18]], align 2
+; CHECK-NEXT: store <16 x i16> [[TMP15]], ptr [[TMP17]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i32> [[VEC_IND]], splat (i32 16)
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -80,8 +79,7 @@ define void @test_pr59459(i64 %iv.start, ptr %arr) {
; CHECK-NEXT: [[TMP26:%.*]] = trunc <4 x i32> [[TMP25]] to <4 x i16>
; CHECK-NEXT: [[TMP27:%.*]] = zext i32 [[TMP23]] to i64
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i16, ptr [[ARR]], i64 [[TMP27]]
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i16, ptr [[TMP28]], i32 0
-; CHECK-NEXT: store <4 x i16> [[TMP26]], ptr [[TMP29]], align 2
+; CHECK-NEXT: store <4 x i16> [[TMP26]], ptr [[TMP28]], align 2
; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX8]], 4
; CHECK-NEXT: [[VEC_IND_NEXT13]] = add <4 x i32> [[VEC_IND12]], splat (i32 4)
; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC4]]
@@ -166,11 +164,10 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l,
; CHECK-NEXT: [[TMP6:%.*]] = sub <16 x i16> [[STEP_ADD_2]], [[BROADCAST_SPLAT3]]
; CHECK-NEXT: [[TMP7:%.*]] = sub <16 x i16> [[STEP_ADD_3]], [[BROADCAST_SPLAT3]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[K:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 16
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 32
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 48
-; CHECK-NEXT: store <16 x i16> [[TMP4]], ptr [[TMP9]], align 2
+; CHECK-NEXT: store <16 x i16> [[TMP4]], ptr [[TMP8]], align 2
; CHECK-NEXT: store <16 x i16> [[TMP5]], ptr [[TMP10]], align 2
; CHECK-NEXT: store <16 x i16> [[TMP6]], ptr [[TMP21]], align 2
; CHECK-NEXT: store <16 x i16> [[TMP7]], ptr [[TMP12]], align 2
@@ -211,8 +208,7 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l,
; CHECK-NEXT: [[VEC_IND20:%.*]] = phi <8 x i16> [ [[INDUCTION17]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT21:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP17:%.*]] = sub <8 x i16> [[VEC_IND20]], [[DOTSPLAT14]]
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[K]], i64 [[INDEX12]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[TMP18]], i32 0
-; CHECK-NEXT: store <8 x i16> [[TMP17]], ptr [[TMP19]], align 2
+; CHECK-NEXT: store <8 x i16> [[TMP17]], ptr [[TMP18]], align 2
; CHECK-NEXT: [[INDEX_NEXT24]] = add nuw i64 [[INDEX12]], 8
; CHECK-NEXT: [[VEC_IND_NEXT21]] = add <8 x i16> [[VEC_IND20]], [[BROADCAST_SPLAT23]]
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT24]], [[N_VEC5]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
index 86fa5d6..fe2ad66 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
@@ -26,18 +26,16 @@ define void @firstorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i8> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_LOAD1:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 16
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[WIDE_LOAD1]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i8> [[VECTOR_RECUR]], <16 x i8> [[WIDE_LOAD]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[WIDE_LOAD]], <16 x i8> [[WIDE_LOAD1]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
; CHECK-NEXT: [[TMP9:%.*]] = add <16 x i8> [[WIDE_LOAD]], [[TMP7]]
; CHECK-NEXT: [[TMP10:%.*]] = add <16 x i8> [[WIDE_LOAD1]], [[TMP8]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16
-; CHECK-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP13]], align 1
+; CHECK-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP11]], align 1
; CHECK-NEXT: store <16 x i8> [[TMP10]], ptr [[TMP14]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -121,9 +119,8 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
; CHECK-NEXT: [[VECTOR_RECUR4:%.*]] = phi <16 x i8> [ [[VECTOR_RECUR_INIT3]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 16
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[WIDE_LOAD5]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i8> [[VECTOR_RECUR]], <16 x i8> [[WIDE_LOAD]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
; CHECK-NEXT: [[TMP8]] = shufflevector <16 x i8> [[WIDE_LOAD]], <16 x i8> [[WIDE_LOAD5]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
@@ -138,9 +135,8 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
; CHECK-NEXT: [[TMP17:%.*]] = add <16 x i8> [[TMP15]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP18:%.*]] = add <16 x i8> [[TMP16]], [[WIDE_LOAD5]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP19]], i32 0
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP19]], i32 16
-; CHECK-NEXT: store <16 x i8> [[TMP17]], ptr [[TMP21]], align 1
+; CHECK-NEXT: store <16 x i8> [[TMP17]], ptr [[TMP19]], align 1
; CHECK-NEXT: store <16 x i8> [[TMP18]], ptr [[TMP22]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -313,7 +309,7 @@ define void @for_iv_trunc_optimized(ptr %dst) {
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 337, [[MIDDLE_BLOCK]] ], [ 1, [[BB:%.*]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1, [[BB]] ]
@@ -328,7 +324,7 @@ define void @for_iv_trunc_optimized(ptr %dst) {
; CHECK-NEXT: store i32 [[FOR_2]], ptr [[DST]], align 4
; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i64 [[IV]], 337
; CHECK-NEXT: [[TRUNC]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT:%.*]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fminimumnum.ll b/llvm/test/Transforms/LoopVectorize/X86/fminimumnum.ll
index 1b359f7..21fc8e4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fminimumnum.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fminimumnum.ll
@@ -22,29 +22,26 @@ define void @fmin32(ptr noundef readonly captures(none) %input1, ptr noundef rea
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT1]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP2]], i32 4
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT2]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP5]], i32 4
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD6]])
; CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[WIDE_LOAD5]], <4 x float> [[WIDE_LOAD7]])
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[OUTPUT]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i32 4
-; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[TMP11]], align 4
+; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[TMP10]], align 4
; CHECK-NEXT: store <4 x float> [[TMP9]], ptr [[TMP12]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -103,29 +100,26 @@ define void @fmax32(ptr noundef readonly captures(none) %input1, ptr noundef rea
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT1]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP2]], i32 4
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT2]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP5]], i32 4
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD6]])
; CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> [[WIDE_LOAD5]], <4 x float> [[WIDE_LOAD7]])
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[OUTPUT]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i32 4
-; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[TMP11]], align 4
+; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[TMP10]], align 4
; CHECK-NEXT: store <4 x float> [[TMP9]], ptr [[TMP12]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -184,29 +178,26 @@ define void @fmin64(ptr noundef readonly captures(none) %input1, ptr noundef rea
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT1]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 2
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x double>, ptr [[TMP4]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT2]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw double, ptr [[TMP5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw double, ptr [[TMP5]], i32 2
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x double>, ptr [[TMP6]], align 8
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD6]])
; CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> [[WIDE_LOAD5]], <2 x double> [[WIDE_LOAD7]])
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[OUTPUT]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw double, ptr [[TMP10]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw double, ptr [[TMP10]], i32 2
-; CHECK-NEXT: store <2 x double> [[TMP8]], ptr [[TMP11]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP8]], ptr [[TMP10]], align 8
; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[TMP12]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -265,29 +256,26 @@ define void @fmax64(ptr noundef readonly captures(none) %input1, ptr noundef rea
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT1]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw double, ptr [[TMP2]], i32 2
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x double>, ptr [[TMP4]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT2]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw double, ptr [[TMP5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw double, ptr [[TMP5]], i32 2
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x double>, ptr [[TMP6]], align 8
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD6]])
; CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> [[WIDE_LOAD5]], <2 x double> [[WIDE_LOAD7]])
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[OUTPUT]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw double, ptr [[TMP10]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw double, ptr [[TMP10]], i32 2
-; CHECK-NEXT: store <2 x double> [[TMP8]], ptr [[TMP11]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP8]], ptr [[TMP10]], align 8
; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[TMP12]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -346,22 +334,19 @@ define void @fmin16(ptr noundef readonly captures(none) %input1, ptr noundef rea
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT1]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw half, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP3]], align 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP2]], align 2
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT2]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw half, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x half>, ptr [[TMP5]], align 2
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x half>, ptr [[TMP4]], align 2
; CHECK-NEXT: [[TMP6:%.*]] = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> [[WIDE_LOAD]], <8 x half> [[WIDE_LOAD5]])
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[OUTPUT]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw half, ptr [[TMP7]], i32 0
-; CHECK-NEXT: store <8 x half> [[TMP6]], ptr [[TMP8]], align 2
+; CHECK-NEXT: store <8 x half> [[TMP6]], ptr [[TMP7]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -420,22 +405,19 @@ define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef rea
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT1]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw half, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP3]], align 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP2]], align 2
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT2]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw half, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x half>, ptr [[TMP5]], align 2
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x half>, ptr [[TMP4]], align 2
; CHECK-NEXT: [[TMP6:%.*]] = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> [[WIDE_LOAD]], <8 x half> [[WIDE_LOAD5]])
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[OUTPUT]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw half, ptr [[TMP7]], i32 0
-; CHECK-NEXT: store <8 x half> [[TMP6]], ptr [[TMP8]], align 2
+; CHECK-NEXT: store <8 x half> [[TMP6]], ptr [[TMP7]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
index c4fc609..07ff8e2 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -28,19 +28,16 @@ define void @foo1(ptr noalias %in, ptr noalias %out, ptr noalias %trigger, ptr n
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[INDEX1]]
-; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP2]], align 4
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 4
; AVX512-NEXT: [[TMP3:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD]], zeroinitializer
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[INDEX:%.*]], i64 [[INDEX1]]
-; AVX512-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
-; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP5]], i32 4, <16 x i1> [[TMP3]], <16 x i32> poison)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP4]], i32 4, <16 x i1> [[TMP3]], <16 x i32> poison)
; AVX512-NEXT: [[TMP6:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD]] to <16 x i64>
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <16 x i64> [[TMP6]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[TMP7]], i32 4, <16 x i1> [[TMP3]], <16 x float> poison)
; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER]], splat (float 5.000000e-01)
; AVX512-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[OUT:%.*]], i64 [[INDEX1]]
-; AVX512-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[TMP9]], i32 0
-; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP8]], ptr [[TMP10]], i32 4, <16 x i1> [[TMP3]])
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP8]], ptr [[TMP9]], i32 4, <16 x i1> [[TMP3]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
; AVX512-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; AVX512-NEXT: br i1 [[TMP11]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -53,19 +50,16 @@ define void @foo1(ptr noalias %in, ptr noalias %out, ptr noalias %trigger, ptr n
; FVW2: vector.body:
; FVW2-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[INDEX1]]
-; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; FVW2-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer
; FVW2-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[INDEX:%.*]], i64 [[INDEX1]]
-; FVW2-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
-; FVW2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr [[TMP5]], i32 4, <2 x i1> [[TMP3]], <2 x i32> poison)
+; FVW2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr [[TMP4]], i32 4, <2 x i1> [[TMP3]], <2 x i32> poison)
; FVW2-NEXT: [[TMP6:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD]] to <2 x i64>
; FVW2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <2 x i64> [[TMP6]]
; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> [[TMP7]], i32 4, <2 x i1> [[TMP3]], <2 x float> poison)
; FVW2-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], splat (float 5.000000e-01)
; FVW2-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[OUT:%.*]], i64 [[INDEX1]]
-; FVW2-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[TMP9]], i32 0
-; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0(<2 x float> [[TMP8]], ptr [[TMP10]], i32 4, <2 x i1> [[TMP3]])
+; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0(<2 x float> [[TMP8]], ptr [[TMP9]], i32 4, <2 x i1> [[TMP3]])
; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
; FVW2-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; FVW2-NEXT: br i1 [[TMP11]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -647,11 +641,9 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt
; AVX512-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[OFFSET_IDX]]
; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM]]
-; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
-; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x float>, ptr [[TMP18]], align 4, !alias.scope [[META8:![0-9]+]]
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x float>, ptr [[TMP17]], align 4, !alias.scope [[META8:![0-9]+]]
; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[WIDE_LOAD]], <16 x ptr> [[TMP14]], i32 4, <16 x i1> splat (i1 true)), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]]
-; AVX512-NEXT: [[TMP19:%.*]] = getelementptr float, ptr [[TMP16]], i32 0
-; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x float>, ptr [[TMP19]], align 4, !alias.scope [[META15:![0-9]+]]
+; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x float>, ptr [[TMP16]], align 4, !alias.scope [[META15:![0-9]+]]
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, <16 x ptr> [[TMP14]], i64 1
; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[WIDE_LOAD8]], <16 x ptr> [[TMP20]], i32 4, <16 x i1> splat (i1 true)), !alias.scope [[META11]], !noalias [[META13]]
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -686,17 +678,15 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt
; AVX512-NEXT: [[OFFSET_IDX21:%.*]] = mul i64 [[INDEX18]], 4
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[OFFSET_IDX21]]
; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[IDXPROM]]
-; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i32 0
-; AVX512-NEXT: [[WIDE_LOAD17:%.*]] = load <8 x float>, ptr [[TMP30]], align 4, !alias.scope [[META17:![0-9]+]]
-; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD17]], <8 x ptr> [[TMP26]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20:![0-9]+]], !noalias [[META22:![0-9]+]]
-; AVX512-NEXT: [[TMP31:%.*]] = getelementptr float, ptr [[TMP28]], i32 0
-; AVX512-NEXT: [[WIDE_LOAD18:%.*]] = load <8 x float>, ptr [[TMP31]], align 4, !alias.scope [[META24:![0-9]+]]
+; AVX512-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x float>, ptr [[TMP29]], align 4, !alias.scope [[META8]]
+; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD16]], <8 x ptr> [[TMP26]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META11]], !noalias [[META13]]
+; AVX512-NEXT: [[WIDE_LOAD17:%.*]] = load <8 x float>, ptr [[TMP28]], align 4, !alias.scope [[META15]]
; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, <8 x ptr> [[TMP26]], i64 1
-; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD18]], <8 x ptr> [[TMP32]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20]], !noalias [[META22]]
+; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD17]], <8 x ptr> [[TMP32]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META11]], !noalias [[META13]]
; AVX512-NEXT: [[INDEX_NEXT24]] = add nuw i64 [[INDEX18]], 8
; AVX512-NEXT: [[PTR_IND20]] = getelementptr i8, ptr [[POINTER_PHI19]], i64 512
; AVX512-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT24]], [[N_VEC10]]
-; AVX512-NEXT: br i1 [[TMP33]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
+; AVX512-NEXT: br i1 [[TMP33]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; AVX512: vec.epilog.middle.block:
; AVX512-NEXT: [[CMP_N17:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC10]]
; AVX512-NEXT: br i1 [[CMP_N17]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -716,7 +706,7 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt
; AVX512-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[PTR_ADDR_012]], i64 1
; AVX512-NEXT: [[ADD_PTR6]] = getelementptr inbounds float, ptr [[DEST_ADDR_011]], i64 16
; AVX512-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[ADD_PTR]]
-; AVX512-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; AVX512-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
@@ -775,14 +765,12 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt
; FVW2-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP17]]
; FVW2-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP18]]
; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM]]
-; FVW2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i32 0
-; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP22]], align 4, !alias.scope [[META8:![0-9]+]]
+; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP21]], align 4, !alias.scope [[META8:![0-9]+]]
; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0
; FVW2-NEXT: store float [[TMP23]], ptr [[TMP19]], align 4, !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]]
; FVW2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1
; FVW2-NEXT: store float [[TMP24]], ptr [[TMP20]], align 4, !alias.scope [[META11]], !noalias [[META13]]
-; FVW2-NEXT: [[TMP25:%.*]] = getelementptr float, ptr [[TMP16]], i32 0
-; FVW2-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x float>, ptr [[TMP25]], align 4, !alias.scope [[META15:![0-9]+]]
+; FVW2-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x float>, ptr [[TMP16]], align 4, !alias.scope [[META15:![0-9]+]]
; FVW2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 1
; FVW2-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 1
; FVW2-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[WIDE_LOAD10]], i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll b/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll
index 929a2e5..6938ffb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll
@@ -12,17 +12,15 @@ define void @gep_use_in_dead_block(ptr noalias %dst, ptr %src) {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP5]], align 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP4]], align 2
; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <4 x i16> [[WIDE_LOAD]], splat (i16 10)
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[TMP8]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.v4i16.p0(<4 x i16> zeroinitializer, ptr [[TMP12]], i32 2, <4 x i1> [[TMP7]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i16.p0(<4 x i16> zeroinitializer, ptr [[TMP8]], i32 2, <4 x i1> [[TMP7]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
@@ -42,7 +40,7 @@ define void @gep_use_in_dead_block(ptr noalias %dst, ptr %src) {
; CHECK: [[LOOP_LATCH]]:
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 99
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -86,19 +84,16 @@ define void @gep_use_outside_loop(ptr noalias %dst, ptr %src) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[DST]], <4 x i64> [[VEC_IND]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP3]], align 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP2]], align 2
; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i16> [[WIDE_LOAD]], splat (i16 10)
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[TMP6]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.v4i16.p0(<4 x i16> zeroinitializer, ptr [[TMP7]], i32 2, <4 x i1> [[TMP5]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i16.p0(<4 x i16> zeroinitializer, ptr [[TMP6]], i32 2, <4 x i1> [[TMP5]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x ptr> [[TMP1]], i32 3
-; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
@@ -115,9 +110,9 @@ define void @gep_use_outside_loop(ptr noalias %dst, ptr %src) {
; CHECK: [[LOOP_LATCH]]:
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 99
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[GEP_DST_LCSSA:%.*]] = phi ptr [ [[GEP_DST]], %[[LOOP_LATCH]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[GEP_DST_LCSSA:%.*]] = phi ptr [ [[GEP_DST]], %[[LOOP_LATCH]] ]
; CHECK-NEXT: store i16 0, ptr [[GEP_DST_LCSSA]], align 2
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
index 4ed5489..9168ebf 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
@@ -74,8 +74,7 @@ define void @foo(ptr nocapture %a, ptr nocapture %b, i32 %k, i32 %m) #0 {
; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[ADD_US]], [[TMP11]]
; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4
; CHECK-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], splat (i32 1)
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[TMP17]], i32 3
; CHECK-NEXT: store i32 [[TMP18]], ptr [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access [[META0]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
index a62e8ff..7fe4c14 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
@@ -75,9 +75,8 @@ define double @sumIfVector(ptr nocapture readonly %arr) {
; SSE-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
; SSE-NEXT: [[VEC_PHI1:%.*]] = phi <2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI3:%.*]], [[VECTOR_BODY]] ]
; SSE-NEXT: [[TMP2:%.*]] = getelementptr double, ptr [[ARR:%.*]], i32 [[INDEX]]
-; SSE-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[TMP2]], i32 0
; SSE-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[TMP2]], i32 2
-; SSE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP4]], align 8
+; SSE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
; SSE-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
; SSE-NEXT: [[TMP6:%.*]] = fcmp fast une <2 x double> [[WIDE_LOAD]], splat (double 4.200000e+01)
; SSE-NEXT: [[TMP7:%.*]] = fcmp fast une <2 x double> [[WIDE_LOAD2]], splat (double 4.200000e+01)
@@ -91,10 +90,10 @@ define double @sumIfVector(ptr nocapture readonly %arr) {
; SSE: middle.block:
; SSE-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[PREDPHI3]], [[PREDPHI]]
; SSE-NEXT: [[TMP11:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[BIN_RDX]])
-; SSE-NEXT: br i1 true, label [[DONE:%.*]], label [[SCALAR_PH]]
+; SSE-NEXT: br label [[DONE:%.*]]
; SSE: scalar.ph:
-; SSE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SSE-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
+; SSE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ]
+; SSE-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ]
; SSE-NEXT: br label [[LOOP:%.*]]
; SSE: loop:
; SSE-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
@@ -129,11 +128,10 @@ define double @sumIfVector(ptr nocapture readonly %arr) {
; AVX-NEXT: [[VEC_PHI2:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI8:%.*]], [[VECTOR_BODY]] ]
; AVX-NEXT: [[VEC_PHI3:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI9:%.*]], [[VECTOR_BODY]] ]
; AVX-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[ARR:%.*]], i32 [[INDEX]]
-; AVX-NEXT: [[TMP8:%.*]] = getelementptr double, ptr [[TMP4]], i32 0
; AVX-NEXT: [[TMP9:%.*]] = getelementptr double, ptr [[TMP4]], i32 4
; AVX-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[TMP4]], i32 8
; AVX-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[TMP4]], i32 12
-; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP8]], align 8
+; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP4]], align 8
; AVX-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x double>, ptr [[TMP9]], align 8
; AVX-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x double>, ptr [[TMP10]], align 8
; AVX-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x double>, ptr [[TMP11]], align 8
@@ -157,10 +155,10 @@ define double @sumIfVector(ptr nocapture readonly %arr) {
; AVX-NEXT: [[BIN_RDX10:%.*]] = fadd fast <4 x double> [[PREDPHI8]], [[BIN_RDX]]
; AVX-NEXT: [[BIN_RDX11:%.*]] = fadd fast <4 x double> [[PREDPHI9]], [[BIN_RDX10]]
; AVX-NEXT: [[TMP21:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[BIN_RDX11]])
-; AVX-NEXT: br i1 true, label [[DONE:%.*]], label [[SCALAR_PH]]
+; AVX-NEXT: br label [[DONE:%.*]]
; AVX: scalar.ph:
-; AVX-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; AVX-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
+; AVX-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ]
+; AVX-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ]
; AVX-NEXT: br label [[LOOP:%.*]]
; AVX: loop:
; AVX-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
index d445e0d..fcd94f4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
@@ -126,14 +126,12 @@ define void @multiple_truncated_ivs_with_wide_uses(i1 %c, ptr %A, ptr %B) {
; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[C]], <4 x i16> [[VEC_IND]], <4 x i16> splat (i16 10)
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[C]], <4 x i16> [[STEP_ADD]], <4 x i16> splat (i16 10)
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[TMP4]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[TMP4]], i32 4
-; CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[TMP6]], align 2, !alias.scope [[META5:![0-9]+]], !noalias [[META8:![0-9]+]]
+; CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[TMP4]], align 2, !alias.scope [[META5:![0-9]+]], !noalias [[META8:![0-9]+]]
; CHECK-NEXT: store <4 x i16> [[TMP2]], ptr [[TMP7]], align 2, !alias.scope [[META5]], !noalias [[META8]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP8]], i32 4
-; CHECK-NEXT: store <4 x i32> [[VEC_IND3]], ptr [[TMP10]], align 4, !alias.scope [[META8]]
+; CHECK-NEXT: store <4 x i32> [[VEC_IND3]], ptr [[TMP8]], align 4, !alias.scope [[META8]]
; CHECK-NEXT: store <4 x i32> [[STEP_ADD4]], ptr [[TMP11]], align 4, !alias.scope [[META8]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
@@ -141,7 +139,7 @@ define void @multiple_truncated_ivs_with_wide_uses(i1 %c, ptr %A, ptr %B) {
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
@@ -156,7 +154,7 @@ define void @multiple_truncated_ivs_with_wide_uses(i1 %c, ptr %A, ptr %B) {
; CHECK-NEXT: store i32 [[IV_32]], ptr [[GEP_B]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 64
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -195,16 +193,15 @@ define void @truncated_ivs_with_wide_and_scalar_uses(i1 %c, ptr %dst) {
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[DST]], i32 [[TMP0]]
; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[C]], <8 x i16> [[VEC_IND]], <8 x i16> splat (i16 10)
; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[C]], <8 x i16> [[STEP_ADD]], <8 x i16> splat (i16 10)
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[TMP3]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[TMP3]], i32 8
-; CHECK-NEXT: store <8 x i16> [[TMP5]], ptr [[TMP7]], align 2
+; CHECK-NEXT: store <8 x i16> [[TMP5]], ptr [[TMP3]], align 2
; CHECK-NEXT: store <8 x i16> [[TMP6]], ptr [[TMP8]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i16> [[STEP_ADD]], splat (i16 8)
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
@@ -217,7 +214,7 @@ define void @truncated_ivs_with_wide_and_scalar_uses(i1 %c, ptr %dst) {
; CHECK-NEXT: store i16 [[SEL]], ptr [[GEP]], align 2
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 64
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -293,10 +290,9 @@ define void @multiple_pointer_ivs_with_scalar_uses_only(ptr %A, ptr %B) #0 {
; CHECK-NEXT: [[NEXT_GEP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP14]]
; CHECK-NEXT: [[NEXT_GEP21:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP15]]
; CHECK-NEXT: [[NEXT_GEP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[NEXT_GEP23]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP18]], align 1, !alias.scope [[META14:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP23]], align 1, !alias.scope [[META14:![0-9]+]]
; CHECK-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP18]], align 1, !alias.scope [[META14]]
+; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[NEXT_GEP23]], align 1, !alias.scope [[META14]]
; CHECK-NEXT: [[TMP21:%.*]] = zext <16 x i8> [[WIDE_LOAD24]] to <16 x i32>
; CHECK-NEXT: [[TMP22]] = add <16 x i32> [[TMP19]], [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP22]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
@@ -339,7 +335,7 @@ define void @multiple_pointer_ivs_with_scalar_uses_only(ptr %A, ptr %B) #0 {
; CHECK-NEXT: br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP22]], i32 15
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ -12, [[MIDDLE_BLOCK]] ], [ 100, [[ENTRY:%.*]] ], [ 100, [[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 2048, [[ENTRY]] ], [ 2048, [[VECTOR_MEMCHECK]] ]
@@ -366,7 +362,7 @@ define void @multiple_pointer_ivs_with_scalar_uses_only(ptr %A, ptr %B) #0 {
; CHECK-NEXT: [[DEC]] = add i32 [[IV_1]], 1
; CHECK-NEXT: [[OUTPTR_0]] = getelementptr i8, ptr [[PTR_IV_1]], i64 2
; CHECK-NEXT: [[CMP30_NOT:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT: br i1 [[CMP30_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP30_NOT]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -416,12 +412,11 @@ define i16 @iv_and_step_trunc() {
; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <2 x i16> [[VEC_IND1]], splat (i16 2)
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -485,11 +480,10 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 {
; CHECK-NEXT: [[TMP21:%.*]] = trunc <8 x i64> [[TMP13]] to <8 x i32>
; CHECK-NEXT: [[TMP22:%.*]] = trunc <8 x i64> [[TMP14]] to <8 x i32>
; CHECK-NEXT: [[TMP23:%.*]] = trunc <8 x i64> [[TMP15]] to <8 x i32>
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP16]], i32 0
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP16]], i32 8
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i32, ptr [[TMP16]], i32 16
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, ptr [[TMP16]], i32 24
-; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP20]], ptr [[TMP24]], i32 4, <8 x i1> [[TMP8]])
+; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP20]], ptr [[TMP16]], i32 4, <8 x i1> [[TMP8]])
; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP21]], ptr [[TMP25]], i32 4, <8 x i1> [[TMP9]])
; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP22]], ptr [[TMP26]], i32 4, <8 x i1> [[TMP10]])
; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP23]], ptr [[TMP27]], i32 4, <8 x i1> [[TMP11]])
@@ -518,8 +512,7 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 {
; CHECK-NEXT: [[TMP34:%.*]] = or <4 x i64> [[BROADCAST_SPLAT10]], [[VEC_IND5]]
; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX4]]
; CHECK-NEXT: [[TMP36:%.*]] = trunc <4 x i64> [[TMP34]] to <4 x i32>
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[TMP35]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP36]], ptr [[TMP29]], i32 4, <4 x i1> [[TMP33]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP36]], ptr [[TMP35]], i32 4, <4 x i1> [[TMP33]])
; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX4]], 4
; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <4 x i64> [[VEC_IND5]], splat (i64 4)
; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 100
@@ -715,14 +708,13 @@ define void @wombat(i32 %arg, ptr %dst) #1 {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 4, [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i32> [[VEC_IND]], splat (i32 12)
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <8 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; CHECK-NEXT: store <8 x i32> [[TMP5]], ptr [[TMP4]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT4]]
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 56
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 60, [[MIDDLE_BLOCK]] ], [ 4, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[MUL]], [[ENTRY]] ]
@@ -737,7 +729,7 @@ define void @wombat(i32 %arg, ptr %dst) #1 {
; CHECK-NEXT: [[ADD]] = add i64 [[PHI]], 1
; CHECK-NEXT: [[ICMP:%.*]] = icmp ugt i64 [[PHI]], 65
; CHECK-NEXT: [[TRUNC]] = trunc i64 [[MUL3]] to i32
-; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]]
+; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -788,14 +780,13 @@ define void @wombat2(i32 %arg, ptr %dst) #1 {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 4, [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i32> [[VEC_IND]], splat (i32 12)
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <8 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; CHECK-NEXT: store <8 x i32> [[TMP5]], ptr [[TMP4]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT4]]
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 56
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 60, [[MIDDLE_BLOCK]] ], [ 4, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[MUL]], [[ENTRY]] ]
@@ -811,7 +802,7 @@ define void @wombat2(i32 %arg, ptr %dst) #1 {
; CHECK-NEXT: [[ICMP:%.*]] = icmp ugt i64 [[PHI]], 65
; CHECK-NEXT: [[TRUNC_0:%.*]] = trunc i64 [[MUL3]] to i60
; CHECK-NEXT: [[TRUNC_1]] = trunc i60 [[TRUNC_0]] to i32
-; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP31:![0-9]+]]
+; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP31:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -864,14 +855,13 @@ define void @with_dead_use(i32 %arg, ptr %dst) #1 {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 4, [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i32> [[VEC_IND]], splat (i32 12)
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <8 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; CHECK-NEXT: store <8 x i32> [[TMP5]], ptr [[TMP4]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT4]]
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 56
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 60, [[MIDDLE_BLOCK]] ], [ 4, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[MUL]], [[ENTRY]] ]
@@ -887,7 +877,7 @@ define void @with_dead_use(i32 %arg, ptr %dst) #1 {
; CHECK-NEXT: [[ICMP:%.*]] = icmp ugt i64 [[PHI]], 65
; CHECK-NEXT: [[TRUNC]] = trunc i64 [[MUL3]] to i32
; CHECK-NEXT: [[DEAD_AND:%.*]] = and i32 [[TRUNC]], 123
-; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]]
+; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll
index 196c755..fea027d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll
@@ -29,9 +29,8 @@ define i16 @wide_add_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i16> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 4
-; CHECK-NEXT: store <4 x i16> [[TMP4]], ptr [[TMP6]], align 2
+; CHECK-NEXT: store <4 x i16> [[TMP4]], ptr [[TMP5]], align 2
; CHECK-NEXT: store <4 x i16> [[TMP9]], ptr [[TMP8]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], [[TMP1]]
@@ -104,9 +103,8 @@ define i16 @wide_sub_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
; CHECK-NEXT: [[TMP5:%.*]] = sub <4 x i16> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP10:%.*]] = sub <4 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 4
-; CHECK-NEXT: store <4 x i16> [[TMP5]], ptr [[TMP7]], align 2
+; CHECK-NEXT: store <4 x i16> [[TMP5]], ptr [[TMP6]], align 2
; CHECK-NEXT: store <4 x i16> [[TMP10]], ptr [[TMP9]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], [[TMP2]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
index 7f2544d..6d562be 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
@@ -37,9 +37,9 @@ define void @test_free_instructions_feeding_geps_for_interleave_groups(ptr noali
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -523,9 +523,9 @@ define void @interleave_store_double_i64(ptr %dst) {
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -644,9 +644,9 @@ define void @interleave_store_i64_double_2(ptr %dst) {
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll
index 95258e6..7d018ea 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll
@@ -52,7 +52,6 @@ define ptr @test_interleave_ptradd_with_replicated_op(ptr %m) #0 {
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[NEXT_GEP13]], i64 4
; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[NEXT_GEP14]], i64 4
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[NEXT_GEP15]], i64 4
-; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[NEXT_GEP16]], i64 4
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[TMP27]], i32 -4
; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[TMP28]], i32 -4
; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP29]], i32 -4
@@ -109,7 +108,7 @@ define ptr @test_interleave_ptradd_with_replicated_op(ptr %m) #0 {
; CHECK-NEXT: [[TMP56:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
; CHECK-NEXT: br i1 [[TMP56]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[M]], %[[ENTRY]] ]
; CHECK-NEXT: [[BC_RESUME_VAL26:%.*]] = phi i32 [ 97, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
@@ -126,9 +125,9 @@ define ptr @test_interleave_ptradd_with_replicated_op(ptr %m) #0 {
; CHECK-NEXT: store i32 [[ADD]], ptr [[PTR_IV]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV]], 100
-; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[P_4_LCSSA:%.*]] = phi ptr [ [[P_4]], %[[LOOP]] ], [ [[TMP31]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[P_4_LCSSA:%.*]] = phi ptr [ [[P_4]], %[[LOOP]] ]
; CHECK-NEXT: ret ptr [[P_4_LCSSA]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
index 3d5e9a6..e3e3944 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
@@ -34,7 +34,7 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) {
; SSE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; SSE-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; SSE: middle.block:
-; SSE-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; SSE-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; SSE: scalar.ph:
; SSE-NEXT: br label [[FOR_BODY:%.*]]
; SSE: for.cond.cleanup:
@@ -88,7 +88,7 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) {
; AVX1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; AVX1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX1: middle.block:
-; AVX1-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; AVX1-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; AVX1: scalar.ph:
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.cond.cleanup:
@@ -142,7 +142,7 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) {
; AVX2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; AVX2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX2: middle.block:
-; AVX2-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; AVX2-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; AVX2: scalar.ph:
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.cond.cleanup:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/iv-live-outs.ll b/llvm/test/Transforms/LoopVectorize/X86/iv-live-outs.ll
index 4ddee9b..78f96ca 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/iv-live-outs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/iv-live-outs.ll
@@ -18,11 +18,10 @@ define i64 @test_pr98660(ptr %dst, i64 %N) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP9]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP9]], i32 8
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP9]], i32 16
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP9]], i32 24
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i32>, ptr [[TMP14]], align 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i32>, ptr [[TMP15]], align 4
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i32>, ptr [[TMP16]], align 4
@@ -30,7 +29,7 @@ define i64 @test_pr98660(ptr %dst, i64 %N) {
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD1]], zeroinitializer
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD2]], zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD3]], zeroinitializer
-; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP13]], i32 4, <8 x i1> [[TMP17]])
+; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP9]], i32 4, <8 x i1> [[TMP17]])
; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP14]], i32 4, <8 x i1> [[TMP18]])
; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP15]], i32 4, <8 x i1> [[TMP19]])
; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP16]], i32 4, <8 x i1> [[TMP20]])
diff --git a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll
index 2871d4d..1a385b6 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll
@@ -14,16 +14,14 @@ define void @test_tc_17_no_epilogue_vectorization(ptr noalias %src, ptr noalias
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 64
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 64
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 64
+; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
@@ -35,7 +33,7 @@ define void @test_tc_17_no_epilogue_vectorization(ptr noalias %src, ptr noalias
; CHECK-NEXT: store i8 [[VAL]], ptr [[STADDR]], align 64
; CHECK-NEXT: [[I_NEXT]] = add i64 [[I]], 1
; CHECK-NEXT: [[IS_NEXT:%.*]] = icmp ult i64 [[I_NEXT]], 17
-; CHECK-NEXT: br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -67,11 +65,9 @@ define void @test_tc_18(ptr noalias %src, ptr noalias %dst) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 64
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 64
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 64
+; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -85,11 +81,9 @@ define void @test_tc_18(ptr noalias %src, ptr noalias %dst) {
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX1]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i8>, ptr [[TMP8]], align 64
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i8>, ptr [[TMP7]], align 64
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX1]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT: store <2 x i8> [[WIDE_LOAD2]], ptr [[TMP10]], align 64
+; CHECK-NEXT: store <2 x i8> [[WIDE_LOAD2]], ptr [[TMP9]], align 64
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 2
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 18
; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -138,11 +132,9 @@ define void @test_tc_19(ptr noalias %src, ptr noalias %dst) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 64
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 64
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 64
+; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 64
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
@@ -156,11 +148,9 @@ define void @test_tc_19(ptr noalias %src, ptr noalias %dst) {
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX1]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i8>, ptr [[TMP8]], align 64
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i8>, ptr [[TMP7]], align 64
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX1]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT: store <2 x i8> [[WIDE_LOAD2]], ptr [[TMP10]], align 64
+; CHECK-NEXT: store <2 x i8> [[WIDE_LOAD2]], ptr [[TMP9]], align 64
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 2
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 18
; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -209,20 +199,18 @@ define void @test_tc_20(ptr noalias %src, ptr noalias %dst) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 8
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 12
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 64
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 64
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 64
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 64
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP5]], align 64
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 4
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 8
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 12
-; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], ptr [[TMP7]], align 64
+; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], ptr [[TMP6]], align 64
; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD1]], ptr [[TMP8]], align 64
; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD2]], ptr [[TMP9]], align 64
; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD3]], ptr [[TMP10]], align 64
@@ -239,11 +227,9 @@ define void @test_tc_20(ptr noalias %src, ptr noalias %dst) {
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX4]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP14]], align 64
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP13]], align 64
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX4]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 0
-; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD5]], ptr [[TMP16]], align 64
+; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD5]], ptr [[TMP15]], align 64
; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], 4
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT6]], 20
; CHECK-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
@@ -293,8 +279,7 @@ define void @limit_main_loop_vf_to_avoid_dead_main_vector_loop(ptr noalias %src,
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i8> [[WIDE_VEC]], <24 x i8> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: store <8 x i8> [[STRIDED_VEC]], ptr [[TMP4]], align 1
+; CHECK-NEXT: store <8 x i8> [[STRIDED_VEC]], ptr [[TMP3]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
index f4cd48d..f615e23 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
@@ -39,11 +39,10 @@ define i32 @test_explicit_pred(i64 %len) {
; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <4 x i64> [[STEP_ADD1]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP7:%.*]] = icmp slt <4 x i64> [[STEP_ADD2]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP8]], i32 4
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP8]], i32 8
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP8]], i32 12
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4
; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4
@@ -64,10 +63,10 @@ define i32 @test_explicit_pred(i64 %len) {
; CHECK-NEXT: [[BIN_RDX13:%.*]] = add <4 x i32> [[TMP18]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX14:%.*]] = add <4 x i32> [[TMP19]], [[BIN_RDX13]]
; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX14]])
-; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -192,11 +191,10 @@ define i32 @test_explicit_pred_generic(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP68]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP64]], align 4
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP69]], align 4
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP70]], align 4
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP71]], align 4
@@ -216,10 +214,10 @@ define i32 @test_explicit_pred_generic(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP74]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP75]], [[BIN_RDX10]]
; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
-; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP77]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -396,10 +394,10 @@ define i32 @test_invariant_address(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP98]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP99]], [[BIN_RDX7]]
; CHECK-NEXT: [[TMP101:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]])
-; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP101]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -667,10 +665,10 @@ define i32 @test_step_narrower_than_access(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP146]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP147]], [[BIN_RDX37]]
; CHECK-NEXT: [[TMP149:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]])
-; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -802,11 +800,10 @@ define i32 @test_max_trip_count(i64 %len, ptr %test_base, i64 %n) {
; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 2
; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i1> [[TMP63]], i1 [[TMP60]], i32 3
; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP65]], i32 0
; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP65]], i32 4
; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP65]], i32 8
; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, ptr [[TMP65]], i32 12
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP40]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP65]], i32 4, <4 x i1> [[TMP40]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP48]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP56]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP72]], i32 4, <4 x i1> [[TMP64]], <4 x i32> poison)
@@ -962,11 +959,10 @@ define i32 @test_non_zero_start(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP68]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP64]], align 4
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP69]], align 4
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP70]], align 4
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP71]], align 4
@@ -986,10 +982,10 @@ define i32 @test_non_zero_start(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP74]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP75]], [[BIN_RDX10]]
; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
-; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 1024, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP77]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -1230,10 +1226,10 @@ define i32 @test_non_unit_stride(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP114]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP115]], [[BIN_RDX7]]
; CHECK-NEXT: [[TMP117:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]])
-; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP117]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -1359,11 +1355,10 @@ define i32 @neg_off_by_many(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP64]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
@@ -1383,10 +1378,10 @@ define i32 @neg_off_by_many(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP74]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP75]], [[BIN_RDX10]]
; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
-; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP77]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -1512,11 +1507,10 @@ define i32 @neg_off_by_one_iteration(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP64]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
@@ -1536,10 +1530,10 @@ define i32 @neg_off_by_one_iteration(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP74]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP75]], [[BIN_RDX10]]
; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
-; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP77]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -1665,11 +1659,10 @@ define i32 @neg_off_by_one_byte(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP64]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
@@ -1689,10 +1682,10 @@ define i32 @neg_off_by_one_byte(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP74]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP75]], [[BIN_RDX10]]
; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
-; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP77]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -1827,11 +1820,10 @@ define i32 @test_constant_max(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 2
; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i1> [[TMP63]], i1 [[TMP60]], i32 3
; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP65]], i32 0
; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP65]], i32 4
; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP65]], i32 8
; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, ptr [[TMP65]], i32 12
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP69]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP65]], align 4
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP70]], align 4
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP71]], align 4
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP72]], align 4
@@ -1988,11 +1980,10 @@ define i32 @test_allocsize(i64 %len, ptr %test_base) nofree nosync {
; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP64]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
@@ -2012,10 +2003,10 @@ define i32 @test_allocsize(i64 %len, ptr %test_base) nofree nosync {
; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP74]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP75]], [[BIN_RDX10]]
; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
-; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP77]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -2142,11 +2133,10 @@ define i32 @test_allocsize_array(i64 %len, ptr %test_base) nofree nosync {
; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP64]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
@@ -2166,10 +2156,10 @@ define i32 @test_allocsize_array(i64 %len, ptr %test_base) nofree nosync {
; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP74]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP75]], [[BIN_RDX10]]
; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
-; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP77]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -2306,11 +2296,10 @@ define i32 @test_allocsize_cond_deref(i1 %allzero, ptr %test_base) {
; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0
; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4
; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8
; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP64]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
@@ -2330,10 +2319,10 @@ define i32 @test_allocsize_cond_deref(i1 %allzero, ptr %test_base) {
; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP74]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP75]], [[BIN_RDX10]]
; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
-; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP77]], [[MIDDLE_BLOCK]] ], [ 0, [[PREHEADER]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[PREHEADER]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -2531,7 +2520,7 @@ define i32 @test_stride_three(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP114]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP115]], [[BIN_RDX7]]
; CHECK-NEXT: [[TMP117:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]])
-; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP117]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
@@ -2551,9 +2540,9 @@ define i32 @test_stride_three(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[VAL_PHI]]
; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 100
-; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP31:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP31:![0-9]+]]
; CHECK: loop_exit:
-; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP117]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ]
; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]]
;
entry:
@@ -2660,7 +2649,7 @@ define i32 @test_non_unit_stride_four(i64 %len, ptr %test_base) {
; CHECK: middle.block:
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP57]], [[TMP56]]
; CHECK-NEXT: [[TMP59:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP59]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
@@ -2680,9 +2669,9 @@ define i32 @test_non_unit_stride_four(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[VAL_PHI]]
; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 100
-; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]]
; CHECK: loop_exit:
-; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP59]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ]
; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]]
;
entry:
@@ -2853,7 +2842,7 @@ define i32 @test_non_unit_stride_five(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP114]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP115]], [[BIN_RDX7]]
; CHECK-NEXT: [[TMP117:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]])
-; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 80, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP117]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
@@ -2873,9 +2862,9 @@ define i32 @test_non_unit_stride_five(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[VAL_PHI]]
; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 100
-; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP35:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP35:![0-9]+]]
; CHECK: loop_exit:
-; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP117]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ]
; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]]
;
entry:
@@ -3046,7 +3035,7 @@ define i32 @test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP114]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP115]], [[BIN_RDX7]]
; CHECK-NEXT: [[TMP117:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]])
-; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP117]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
@@ -3066,9 +3055,9 @@ define i32 @test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[VAL_PHI]]
; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 100
-; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP37:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP37:![0-9]+]]
; CHECK: loop_exit:
-; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP117]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ]
; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]]
;
entry:
@@ -3256,7 +3245,7 @@ define i32 @test_non_unit_stride_with_first_iteration_step_access(i64 %len, ptr
; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP130]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP131]], [[BIN_RDX7]]
; CHECK-NEXT: [[TMP133:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]])
-; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 288, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP133]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
@@ -3276,9 +3265,9 @@ define i32 @test_non_unit_stride_with_first_iteration_step_access(i64 %len, ptr
; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[VAL_PHI]]
; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 300
-; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP39:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP39:![0-9]+]]
; CHECK: loop_exit:
-; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP133]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ]
; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll
index 612f996..af93985 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll
@@ -35,11 +35,10 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 {
; CHECK-NEXT: [[TMP21:%.*]] = trunc <8 x i64> [[TMP13]] to <8 x i32>
; CHECK-NEXT: [[TMP22:%.*]] = trunc <8 x i64> [[TMP14]] to <8 x i32>
; CHECK-NEXT: [[TMP23:%.*]] = trunc <8 x i64> [[TMP15]] to <8 x i32>
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP16]], i32 0
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP16]], i32 8
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i32, ptr [[TMP16]], i32 16
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, ptr [[TMP16]], i32 24
-; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP20]], ptr [[TMP24]], i32 4, <8 x i1> [[TMP8]])
+; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP20]], ptr [[TMP16]], i32 4, <8 x i1> [[TMP8]])
; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP21]], ptr [[TMP25]], i32 4, <8 x i1> [[TMP9]])
; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP22]], ptr [[TMP26]], i32 4, <8 x i1> [[TMP10]])
; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP23]], ptr [[TMP27]], i32 4, <8 x i1> [[TMP11]])
@@ -68,8 +67,7 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 {
; CHECK-NEXT: [[TMP34:%.*]] = or <4 x i64> [[BROADCAST_SPLAT10]], [[VEC_IND5]]
; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX4]]
; CHECK-NEXT: [[TMP36:%.*]] = trunc <4 x i64> [[TMP34]] to <4 x i32>
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[TMP35]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP36]], ptr [[TMP29]], i32 4, <4 x i1> [[TMP33]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP36]], ptr [[TMP35]], i32 4, <4 x i1> [[TMP33]])
; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX4]], 4
; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <4 x i64> [[VEC_IND5]], splat (i64 4)
; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 100
@@ -134,13 +132,12 @@ define void @test_scalar_cost_single_store_loop_invariant_cond(ptr %dst, i1 %c)
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP1]], i32 4, <8 x i1> [[BROADCAST_SPLAT]])
+; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[NEXT_GEP]], i32 4, <8 x i1> [[BROADCAST_SPLAT]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DST]], [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
@@ -156,7 +153,7 @@ define void @test_scalar_cost_single_store_loop_invariant_cond(ptr %dst, i1 %c)
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 4
; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[DST]], i64 [[IV_NEXT]]
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[IV]], 116
-; CHECK-NEXT: br i1 [[CMP_NOT]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -204,9 +201,8 @@ define void @test_scalar_cost_single_store_loop_varying_cond(ptr %dst, ptr noali
; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i32> [[WIDE_VEC4]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[STRIDED_VEC]], splat (i32 123)
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <4 x i32> [[STRIDED_VEC5]], splat (i32 123)
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 0
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 4
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP10]], i32 4, <4 x i1> [[TMP8]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[NEXT_GEP]], i32 4, <4 x i1> [[TMP8]])
; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP11]], i32 4, <4 x i1> [[TMP9]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
index 2b6490a..5065dc8 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -36,23 +36,20 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDEX]]
-; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
-; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
+; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
; AVX1-NEXT: [[TMP5:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], splat (i32 100)
; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]]
-; AVX1-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0
-; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP7]], i32 4, <8 x i1> [[TMP5]], <8 x i32> poison)
+; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP6]], i32 4, <8 x i1> [[TMP5]], <8 x i32> poison)
; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX]]
-; AVX1-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP9]], i32 0
-; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP8]], ptr [[TMP10]], i32 4, <8 x i1> [[TMP5]])
+; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP8]], ptr [[TMP9]], i32 4, <8 x i1> [[TMP5]])
; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; AVX1-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; AVX1-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX1: middle.block:
-; AVX1-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; AVX1-NEXT: br label [[FOR_END:%.*]]
; AVX1: scalar.ph:
-; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
@@ -94,11 +91,10 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDEX]]
-; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 8
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 16
; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 24
-; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
+; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4
; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4
@@ -107,11 +103,10 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], splat (i32 100)
; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], splat (i32 100)
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]]
-; AVX2-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP12]], i32 8
; AVX2-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP12]], i32 16
; AVX2-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP12]], i32 24
-; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP13]], i32 4, <8 x i1> [[TMP8]], <8 x i32> poison)
+; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP12]], i32 4, <8 x i1> [[TMP8]], <8 x i32> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP14]], i32 4, <8 x i1> [[TMP9]], <8 x i32> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x i32> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP16]], i32 4, <8 x i1> [[TMP11]], <8 x i32> poison)
@@ -120,11 +115,10 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX2-NEXT: [[TMP19:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
; AVX2-NEXT: [[TMP20:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
; AVX2-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX]]
-; AVX2-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP21]], i32 0
; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP21]], i32 8
; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i32 16
; AVX2-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP21]], i32 24
-; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP17]], ptr [[TMP22]], i32 4, <8 x i1> [[TMP8]])
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP17]], ptr [[TMP21]], i32 4, <8 x i1> [[TMP8]])
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP18]], ptr [[TMP23]], i32 4, <8 x i1> [[TMP9]])
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP19]], ptr [[TMP24]], i32 4, <8 x i1> [[TMP10]])
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP20]], ptr [[TMP25]], i32 4, <8 x i1> [[TMP11]])
@@ -141,16 +135,13 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX2: vec.epilog.vector.body:
; AVX2-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[FOR_BODY]] ]
; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDEX11]]
-; AVX2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP38]], i32 0
-; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr [[TMP29]], align 4
+; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr [[TMP38]], align 4
; AVX2-NEXT: [[TMP30:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], splat (i32 100)
; AVX2-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX11]]
-; AVX2-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[TMP31]], i32 0
-; AVX2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP32]], i32 4, <8 x i1> [[TMP30]], <8 x i32> poison)
+; AVX2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP31]], i32 4, <8 x i1> [[TMP30]], <8 x i32> poison)
; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_LOAD12]]
; AVX2-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX11]]
-; AVX2-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[TMP34]], i32 0
-; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP33]], ptr [[TMP35]], i32 4, <8 x i1> [[TMP30]])
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP33]], ptr [[TMP34]], i32 4, <8 x i1> [[TMP30]])
; AVX2-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8
; AVX2-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000
; AVX2-NEXT: br i1 [[TMP36]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -199,11 +190,10 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDEX]]
-; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 16
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 32
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 48
-; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP3]], align 4
; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4
; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr [[TMP6]], align 4
; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4
@@ -212,11 +202,10 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], splat (i32 100)
; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], splat (i32 100)
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]]
-; AVX512-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP12]], i32 16
; AVX512-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP12]], i32 32
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP12]], i32 48
-; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP13]], i32 4, <16 x i1> [[TMP8]], <16 x i32> poison)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP12]], i32 4, <16 x i1> [[TMP8]], <16 x i32> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP14]], i32 4, <16 x i1> [[TMP9]], <16 x i32> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP15]], i32 4, <16 x i1> [[TMP10]], <16 x i32> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP16]], i32 4, <16 x i1> [[TMP11]], <16 x i32> poison)
@@ -225,11 +214,10 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX512-NEXT: [[TMP19:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
; AVX512-NEXT: [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX]]
-; AVX512-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP21]], i32 0
; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP21]], i32 16
; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i32 32
; AVX512-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP21]], i32 48
-; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP17]], ptr [[TMP22]], i32 4, <16 x i1> [[TMP8]])
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP17]], ptr [[TMP21]], i32 4, <16 x i1> [[TMP8]])
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP18]], ptr [[TMP23]], i32 4, <16 x i1> [[TMP9]])
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP19]], ptr [[TMP24]], i32 4, <16 x i1> [[TMP10]])
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP20]], ptr [[TMP25]], i32 4, <16 x i1> [[TMP11]])
@@ -246,16 +234,13 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX512: vec.epilog.vector.body:
; AVX512-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDEX11]]
-; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 0
-; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, ptr [[TMP29]], align 4
+; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, ptr [[TMP28]], align 4
; AVX512-NEXT: [[TMP30:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], splat (i32 100)
; AVX512-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX11]]
-; AVX512-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[TMP31]], i32 0
-; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP32]], i32 4, <16 x i1> [[TMP30]], <16 x i32> poison)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP31]], i32 4, <16 x i1> [[TMP30]], <16 x i32> poison)
; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_LOAD12]]
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX11]]
-; AVX512-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[TMP34]], i32 0
-; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP33]], ptr [[TMP35]], i32 4, <16 x i1> [[TMP30]])
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP33]], ptr [[TMP34]], i32 4, <16 x i1> [[TMP30]])
; AVX512-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 16
; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000
; AVX512-NEXT: br i1 [[TMP36]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -332,23 +317,20 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[INDEX]]
-; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 0
-; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP4]], align 4
+; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP3]], align 4
; AVX1-NEXT: [[TMP5:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], splat (i32 100)
; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[INDEX]]
-; AVX1-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP6]], i32 0
-; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP7]], i32 4, <8 x i1> [[TMP5]], <8 x i32> poison)
+; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP6]], i32 4, <8 x i1> [[TMP5]], <8 x i32> poison)
; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[INDEX]]
-; AVX1-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP9]], i32 0
-; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP8]], ptr addrspace(1) [[TMP10]], i32 4, <8 x i1> [[TMP5]])
+; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], i32 4, <8 x i1> [[TMP5]])
; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; AVX1-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; AVX1-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; AVX1: middle.block:
-; AVX1-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; AVX1-NEXT: br label [[FOR_END:%.*]]
; AVX1: scalar.ph:
-; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
@@ -390,11 +372,10 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[INDEX]]
-; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 0
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 8
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 16
; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 24
-; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP4]], align 4
+; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP3]], align 4
; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP5]], align 4
; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP6]], align 4
; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP7]], align 4
@@ -403,11 +384,10 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc
; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], splat (i32 100)
; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], splat (i32 100)
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[INDEX]]
-; AVX2-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 0
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 8
; AVX2-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 16
; AVX2-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 24
-; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP13]], i32 4, <8 x i1> [[TMP8]], <8 x i32> poison)
+; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP12]], i32 4, <8 x i1> [[TMP8]], <8 x i32> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP14]], i32 4, <8 x i1> [[TMP9]], <8 x i32> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x i32> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP16]], i32 4, <8 x i1> [[TMP11]], <8 x i32> poison)
@@ -416,11 +396,10 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc
; AVX2-NEXT: [[TMP19:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
; AVX2-NEXT: [[TMP20:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
; AVX2-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[INDEX]]
-; AVX2-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 0
; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 8
; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 16
; AVX2-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 24
-; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP17]], ptr addrspace(1) [[TMP22]], i32 4, <8 x i1> [[TMP8]])
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP17]], ptr addrspace(1) [[TMP21]], i32 4, <8 x i1> [[TMP8]])
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP18]], ptr addrspace(1) [[TMP23]], i32 4, <8 x i1> [[TMP9]])
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP19]], ptr addrspace(1) [[TMP24]], i32 4, <8 x i1> [[TMP10]])
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP20]], ptr addrspace(1) [[TMP25]], i32 4, <8 x i1> [[TMP11]])
@@ -437,16 +416,13 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc
; AVX2: vec.epilog.vector.body:
; AVX2-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[FOR_BODY]] ]
; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[INDEX11]]
-; AVX2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP38]], i32 0
-; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP29]], align 4
+; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP38]], align 4
; AVX2-NEXT: [[TMP30:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], splat (i32 100)
; AVX2-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[INDEX11]]
-; AVX2-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP31]], i32 0
-; AVX2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP32]], i32 4, <8 x i1> [[TMP30]], <8 x i32> poison)
+; AVX2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP31]], i32 4, <8 x i1> [[TMP30]], <8 x i32> poison)
; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_LOAD12]]
; AVX2-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[INDEX11]]
-; AVX2-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP34]], i32 0
-; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP33]], ptr addrspace(1) [[TMP35]], i32 4, <8 x i1> [[TMP30]])
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP33]], ptr addrspace(1) [[TMP34]], i32 4, <8 x i1> [[TMP30]])
; AVX2-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8
; AVX2-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000
; AVX2-NEXT: br i1 [[TMP36]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -495,11 +471,10 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[INDEX]]
-; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 0
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 16
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 32
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 48
-; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP4]], align 4
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP3]], align 4
; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP5]], align 4
; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP6]], align 4
; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP7]], align 4
@@ -508,11 +483,10 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc
; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], splat (i32 100)
; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], splat (i32 100)
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[INDEX]]
-; AVX512-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 0
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 16
; AVX512-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 32
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 48
-; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP13]], i32 4, <16 x i1> [[TMP8]], <16 x i32> poison)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP12]], i32 4, <16 x i1> [[TMP8]], <16 x i32> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP14]], i32 4, <16 x i1> [[TMP9]], <16 x i32> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP15]], i32 4, <16 x i1> [[TMP10]], <16 x i32> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP16]], i32 4, <16 x i1> [[TMP11]], <16 x i32> poison)
@@ -521,11 +495,10 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc
; AVX512-NEXT: [[TMP19:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
; AVX512-NEXT: [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[INDEX]]
-; AVX512-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 0
; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 16
; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 32
; AVX512-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 48
-; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP17]], ptr addrspace(1) [[TMP22]], i32 4, <16 x i1> [[TMP8]])
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP17]], ptr addrspace(1) [[TMP21]], i32 4, <16 x i1> [[TMP8]])
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP18]], ptr addrspace(1) [[TMP23]], i32 4, <16 x i1> [[TMP9]])
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP19]], ptr addrspace(1) [[TMP24]], i32 4, <16 x i1> [[TMP10]])
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP20]], ptr addrspace(1) [[TMP25]], i32 4, <16 x i1> [[TMP11]])
@@ -542,16 +515,13 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc
; AVX512: vec.epilog.vector.body:
; AVX512-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[INDEX11]]
-; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP28]], i32 0
-; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP29]], align 4
+; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP28]], align 4
; AVX512-NEXT: [[TMP30:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], splat (i32 100)
; AVX512-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[INDEX11]]
-; AVX512-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP31]], i32 0
-; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP32]], i32 4, <16 x i1> [[TMP30]], <16 x i32> poison)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP31]], i32 4, <16 x i1> [[TMP30]], <16 x i32> poison)
; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_LOAD12]]
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[INDEX11]]
-; AVX512-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP34]], i32 0
-; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP33]], ptr addrspace(1) [[TMP35]], i32 4, <16 x i1> [[TMP30]])
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP33]], ptr addrspace(1) [[TMP34]], i32 4, <16 x i1> [[TMP30]])
; AVX512-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 16
; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000
; AVX512-NEXT: br i1 [[TMP36]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -637,24 +607,21 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDEX]]
-; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
-; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
+; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
; AVX1-NEXT: [[TMP5:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], splat (i32 100)
; AVX1-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[B]], i64 [[INDEX]]
-; AVX1-NEXT: [[TMP7:%.*]] = getelementptr float, ptr [[TMP6]], i32 0
-; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP7]], i32 4, <8 x i1> [[TMP5]], <8 x float> poison)
+; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP6]], i32 4, <8 x i1> [[TMP5]], <8 x float> poison)
; AVX1-NEXT: [[TMP8:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float>
; AVX1-NEXT: [[TMP9:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP8]]
; AVX1-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]]
-; AVX1-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i32 0
-; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP9]], ptr [[TMP11]], i32 4, <8 x i1> [[TMP5]])
+; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP9]], ptr [[TMP10]], i32 4, <8 x i1> [[TMP5]])
; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; AVX1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; AVX1: middle.block:
-; AVX1-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; AVX1-NEXT: br label [[FOR_END:%.*]]
; AVX1: scalar.ph:
-; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
@@ -697,11 +664,10 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDEX]]
-; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 8
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 16
; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 24
-; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
+; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4
; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4
@@ -710,11 +676,10 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], splat (i32 100)
; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], splat (i32 100)
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[B]], i64 [[INDEX]]
-; AVX2-NEXT: [[TMP13:%.*]] = getelementptr float, ptr [[TMP12]], i32 0
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[TMP12]], i32 8
; AVX2-NEXT: [[TMP15:%.*]] = getelementptr float, ptr [[TMP12]], i32 16
; AVX2-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[TMP12]], i32 24
-; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP13]], i32 4, <8 x i1> [[TMP8]], <8 x float> poison)
+; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP12]], i32 4, <8 x i1> [[TMP8]], <8 x float> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP14]], i32 4, <8 x i1> [[TMP9]], <8 x float> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x float> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP16]], i32 4, <8 x i1> [[TMP11]], <8 x float> poison)
@@ -727,11 +692,10 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX2-NEXT: [[TMP23:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD9]], [[TMP19]]
; AVX2-NEXT: [[TMP24:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD10]], [[TMP20]]
; AVX2-NEXT: [[TMP25:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]]
-; AVX2-NEXT: [[TMP26:%.*]] = getelementptr float, ptr [[TMP25]], i32 0
; AVX2-NEXT: [[TMP27:%.*]] = getelementptr float, ptr [[TMP25]], i32 8
; AVX2-NEXT: [[TMP28:%.*]] = getelementptr float, ptr [[TMP25]], i32 16
; AVX2-NEXT: [[TMP29:%.*]] = getelementptr float, ptr [[TMP25]], i32 24
-; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP21]], ptr [[TMP26]], i32 4, <8 x i1> [[TMP8]])
+; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP21]], ptr [[TMP25]], i32 4, <8 x i1> [[TMP8]])
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP22]], ptr [[TMP27]], i32 4, <8 x i1> [[TMP9]])
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP23]], ptr [[TMP28]], i32 4, <8 x i1> [[TMP10]])
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP24]], ptr [[TMP29]], i32 4, <8 x i1> [[TMP11]])
@@ -748,17 +712,14 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX2: vec.epilog.vector.body:
; AVX2-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[FOR_BODY]] ]
; AVX2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDEX11]]
-; AVX2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP43]], i32 0
-; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr [[TMP33]], align 4
+; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr [[TMP43]], align 4
; AVX2-NEXT: [[TMP34:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], splat (i32 100)
; AVX2-NEXT: [[TMP35:%.*]] = getelementptr float, ptr [[B]], i64 [[INDEX11]]
-; AVX2-NEXT: [[TMP36:%.*]] = getelementptr float, ptr [[TMP35]], i32 0
-; AVX2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP36]], i32 4, <8 x i1> [[TMP34]], <8 x float> poison)
+; AVX2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP35]], i32 4, <8 x i1> [[TMP34]], <8 x float> poison)
; AVX2-NEXT: [[TMP37:%.*]] = sitofp <8 x i32> [[WIDE_LOAD12]] to <8 x float>
; AVX2-NEXT: [[TMP38:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD13]], [[TMP37]]
; AVX2-NEXT: [[TMP39:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX11]]
-; AVX2-NEXT: [[TMP40:%.*]] = getelementptr float, ptr [[TMP39]], i32 0
-; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP38]], ptr [[TMP40]], i32 4, <8 x i1> [[TMP34]])
+; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP38]], ptr [[TMP39]], i32 4, <8 x i1> [[TMP34]])
; AVX2-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8
; AVX2-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000
; AVX2-NEXT: br i1 [[TMP41]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
@@ -808,11 +769,10 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDEX]]
-; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 16
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 32
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 48
-; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP3]], align 4
; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4
; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr [[TMP6]], align 4
; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4
@@ -821,11 +781,10 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], splat (i32 100)
; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], splat (i32 100)
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[B]], i64 [[INDEX]]
-; AVX512-NEXT: [[TMP13:%.*]] = getelementptr float, ptr [[TMP12]], i32 0
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[TMP12]], i32 16
; AVX512-NEXT: [[TMP15:%.*]] = getelementptr float, ptr [[TMP12]], i32 32
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[TMP12]], i32 48
-; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP13]], i32 4, <16 x i1> [[TMP8]], <16 x float> poison)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP12]], i32 4, <16 x i1> [[TMP8]], <16 x float> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP14]], i32 4, <16 x i1> [[TMP9]], <16 x float> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP15]], i32 4, <16 x i1> [[TMP10]], <16 x float> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP16]], i32 4, <16 x i1> [[TMP11]], <16 x float> poison)
@@ -838,11 +797,10 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD9]], [[TMP19]]
; AVX512-NEXT: [[TMP24:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD10]], [[TMP20]]
; AVX512-NEXT: [[TMP25:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]]
-; AVX512-NEXT: [[TMP26:%.*]] = getelementptr float, ptr [[TMP25]], i32 0
; AVX512-NEXT: [[TMP27:%.*]] = getelementptr float, ptr [[TMP25]], i32 16
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr float, ptr [[TMP25]], i32 32
; AVX512-NEXT: [[TMP29:%.*]] = getelementptr float, ptr [[TMP25]], i32 48
-; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP21]], ptr [[TMP26]], i32 4, <16 x i1> [[TMP8]])
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP21]], ptr [[TMP25]], i32 4, <16 x i1> [[TMP8]])
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP22]], ptr [[TMP27]], i32 4, <16 x i1> [[TMP9]])
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP23]], ptr [[TMP28]], i32 4, <16 x i1> [[TMP10]])
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP24]], ptr [[TMP29]], i32 4, <16 x i1> [[TMP11]])
@@ -859,17 +817,14 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX512: vec.epilog.vector.body:
; AVX512-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDEX11]]
-; AVX512-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 0
-; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, ptr [[TMP33]], align 4
+; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, ptr [[TMP32]], align 4
; AVX512-NEXT: [[TMP34:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], splat (i32 100)
; AVX512-NEXT: [[TMP35:%.*]] = getelementptr float, ptr [[B]], i64 [[INDEX11]]
-; AVX512-NEXT: [[TMP36:%.*]] = getelementptr float, ptr [[TMP35]], i32 0
-; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP36]], i32 4, <16 x i1> [[TMP34]], <16 x float> poison)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP35]], i32 4, <16 x i1> [[TMP34]], <16 x float> poison)
; AVX512-NEXT: [[TMP37:%.*]] = sitofp <16 x i32> [[WIDE_LOAD12]] to <16 x float>
; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD13]], [[TMP37]]
; AVX512-NEXT: [[TMP39:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX11]]
-; AVX512-NEXT: [[TMP40:%.*]] = getelementptr float, ptr [[TMP39]], i32 0
-; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP38]], ptr [[TMP40]], i32 4, <16 x i1> [[TMP34]])
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP38]], ptr [[TMP39]], i32 4, <16 x i1> [[TMP34]])
; AVX512-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 16
; AVX512-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000
; AVX512-NEXT: br i1 [[TMP41]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
@@ -959,11 +914,10 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDEX]]
-; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 4
; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 8
; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 12
-; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !alias.scope [[META8:![0-9]+]]
+; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4, !alias.scope [[META8:![0-9]+]]
; AVX1-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4, !alias.scope [[META8]]
; AVX1-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4, !alias.scope [[META8]]
; AVX1-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META8]]
@@ -972,11 +926,10 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX1-NEXT: [[TMP8:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD7]], splat (i32 100)
; AVX1-NEXT: [[TMP9:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD8]], splat (i32 100)
; AVX1-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]]
-; AVX1-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[TMP10]], i32 0
; AVX1-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[TMP10]], i32 4
; AVX1-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP10]], i32 8
; AVX1-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP10]], i32 12
-; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP11]], i32 8, <4 x i1> [[TMP6]], <4 x double> poison), !alias.scope [[META11:![0-9]+]]
+; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP10]], i32 8, <4 x i1> [[TMP6]], <4 x double> poison), !alias.scope [[META11:![0-9]+]]
; AVX1-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP12]], i32 8, <4 x i1> [[TMP7]], <4 x double> poison), !alias.scope [[META11]]
; AVX1-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x double> poison), !alias.scope [[META11]]
; AVX1-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP14]], i32 8, <4 x i1> [[TMP9]], <4 x double> poison), !alias.scope [[META11]]
@@ -989,11 +942,10 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX1-NEXT: [[TMP21:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD10]], [[TMP17]]
; AVX1-NEXT: [[TMP22:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD11]], [[TMP18]]
; AVX1-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[A]], i64 [[INDEX]]
-; AVX1-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[TMP23]], i32 0
; AVX1-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP23]], i32 4
; AVX1-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP23]], i32 8
; AVX1-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP23]], i32 12
-; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP19]], ptr [[TMP24]], i32 8, <4 x i1> [[TMP6]]), !alias.scope [[META13:![0-9]+]], !noalias [[META15:![0-9]+]]
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP19]], ptr [[TMP23]], i32 8, <4 x i1> [[TMP6]]), !alias.scope [[META13:![0-9]+]], !noalias [[META15:![0-9]+]]
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP20]], ptr [[TMP25]], i32 8, <4 x i1> [[TMP7]]), !alias.scope [[META13]], !noalias [[META15]]
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP21]], ptr [[TMP26]], i32 8, <4 x i1> [[TMP8]]), !alias.scope [[META13]], !noalias [[META15]]
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP22]], ptr [[TMP27]], i32 8, <4 x i1> [[TMP9]]), !alias.scope [[META13]], !noalias [[META15]]
@@ -1001,9 +953,9 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX1-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; AVX1-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; AVX1: middle.block:
-; AVX1-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; AVX1-NEXT: br label [[FOR_END:%.*]]
; AVX1: scalar.ph:
-; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
@@ -1046,11 +998,10 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDEX]]
-; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 4
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 8
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 12
-; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !alias.scope [[META11:![0-9]+]]
+; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4, !alias.scope [[META11:![0-9]+]]
; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4, !alias.scope [[META11]]
; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4, !alias.scope [[META11]]
; AVX2-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META11]]
@@ -1059,11 +1010,10 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD7]], splat (i32 100)
; AVX2-NEXT: [[TMP9:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD8]], splat (i32 100)
; AVX2-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]]
-; AVX2-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[TMP10]], i32 0
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[TMP10]], i32 4
; AVX2-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP10]], i32 8
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP10]], i32 12
-; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP11]], i32 8, <4 x i1> [[TMP6]], <4 x double> poison), !alias.scope [[META14:![0-9]+]]
+; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP10]], i32 8, <4 x i1> [[TMP6]], <4 x double> poison), !alias.scope [[META14:![0-9]+]]
; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP12]], i32 8, <4 x i1> [[TMP7]], <4 x double> poison), !alias.scope [[META14]]
; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x double> poison), !alias.scope [[META14]]
; AVX2-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP14]], i32 8, <4 x i1> [[TMP9]], <4 x double> poison), !alias.scope [[META14]]
@@ -1076,11 +1026,10 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX2-NEXT: [[TMP21:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD10]], [[TMP17]]
; AVX2-NEXT: [[TMP22:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD11]], [[TMP18]]
; AVX2-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[A]], i64 [[INDEX]]
-; AVX2-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[TMP23]], i32 0
; AVX2-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP23]], i32 4
; AVX2-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP23]], i32 8
; AVX2-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP23]], i32 12
-; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP19]], ptr [[TMP24]], i32 8, <4 x i1> [[TMP6]]), !alias.scope [[META16:![0-9]+]], !noalias [[META18:![0-9]+]]
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP19]], ptr [[TMP23]], i32 8, <4 x i1> [[TMP6]]), !alias.scope [[META16:![0-9]+]], !noalias [[META18:![0-9]+]]
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP20]], ptr [[TMP25]], i32 8, <4 x i1> [[TMP7]]), !alias.scope [[META16]], !noalias [[META18]]
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP21]], ptr [[TMP26]], i32 8, <4 x i1> [[TMP8]]), !alias.scope [[META16]], !noalias [[META18]]
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP22]], ptr [[TMP27]], i32 8, <4 x i1> [[TMP9]]), !alias.scope [[META16]], !noalias [[META18]]
@@ -1088,9 +1037,9 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX2-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; AVX2-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; AVX2: middle.block:
-; AVX2-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; AVX2-NEXT: br label [[FOR_END:%.*]]
; AVX2: scalar.ph:
-; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
@@ -1135,11 +1084,10 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDEX]]
-; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 8
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 16
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 24
-; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4, !alias.scope [[META11:![0-9]+]]
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP1]], align 4, !alias.scope [[META11:![0-9]+]]
; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4, !alias.scope [[META11]]
; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4, !alias.scope [[META11]]
; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META11]]
@@ -1148,11 +1096,10 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], splat (i32 100)
; AVX512-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD8]], splat (i32 100)
; AVX512-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]]
-; AVX512-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[TMP10]], i32 0
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[TMP10]], i32 8
; AVX512-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP10]], i32 16
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP10]], i32 24
-; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP11]], i32 8, <8 x i1> [[TMP6]], <8 x double> poison), !alias.scope [[META14:![0-9]+]]
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP10]], i32 8, <8 x i1> [[TMP6]], <8 x double> poison), !alias.scope [[META14:![0-9]+]]
; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP12]], i32 8, <8 x i1> [[TMP7]], <8 x double> poison), !alias.scope [[META14]]
; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x double> poison), !alias.scope [[META14]]
; AVX512-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP14]], i32 8, <8 x i1> [[TMP9]], <8 x double> poison), !alias.scope [[META14]]
@@ -1165,11 +1112,10 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX512-NEXT: [[TMP21:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD10]], [[TMP17]]
; AVX512-NEXT: [[TMP22:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD11]], [[TMP18]]
; AVX512-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[A]], i64 [[INDEX]]
-; AVX512-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[TMP23]], i32 0
; AVX512-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP23]], i32 8
; AVX512-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP23]], i32 16
; AVX512-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP23]], i32 24
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP19]], ptr [[TMP24]], i32 8, <8 x i1> [[TMP6]]), !alias.scope [[META16:![0-9]+]], !noalias [[META18:![0-9]+]]
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP19]], ptr [[TMP23]], i32 8, <8 x i1> [[TMP6]]), !alias.scope [[META16:![0-9]+]], !noalias [[META18:![0-9]+]]
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP20]], ptr [[TMP25]], i32 8, <8 x i1> [[TMP7]]), !alias.scope [[META16]], !noalias [[META18]]
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP21]], ptr [[TMP26]], i32 8, <8 x i1> [[TMP8]]), !alias.scope [[META16]], !noalias [[META18]]
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP22]], ptr [[TMP27]], i32 8, <8 x i1> [[TMP9]]), !alias.scope [[META16]], !noalias [[META18]]
@@ -1186,20 +1132,17 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX512: vec.epilog.vector.body:
; AVX512-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[FOR_BODY]] ]
; AVX512-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDEX12]]
-; AVX512-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i32 0
-; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, ptr [[TMP31]], align 4, !alias.scope [[META20:![0-9]+]]
-; AVX512-NEXT: [[TMP32:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], splat (i32 100)
-; AVX512-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX12]]
-; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[TMP33]], i32 0
-; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP34]], i32 8, <8 x i1> [[TMP32]], <8 x double> poison), !alias.scope [[META23:![0-9]+]]
-; AVX512-NEXT: [[TMP35:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x double>
-; AVX512-NEXT: [[TMP36:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD14]], [[TMP35]]
-; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[A]], i64 [[INDEX12]]
-; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP37]], i32 0
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP36]], ptr [[TMP38]], i32 8, <8 x i1> [[TMP32]]), !alias.scope [[META25:![0-9]+]], !noalias [[META27:![0-9]+]]
+; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, ptr [[TMP41]], align 4, !alias.scope [[META11]]
+; AVX512-NEXT: [[TMP31:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], splat (i32 100)
+; AVX512-NEXT: [[TMP32:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX12]]
+; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP32]], i32 8, <8 x i1> [[TMP31]], <8 x double> poison), !alias.scope [[META14]]
+; AVX512-NEXT: [[TMP33:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x double>
+; AVX512-NEXT: [[TMP34:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD14]], [[TMP33]]
+; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[A]], i64 [[INDEX12]]
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP34]], ptr [[TMP35]], i32 8, <8 x i1> [[TMP31]]), !alias.scope [[META16]], !noalias [[META18]]
; AVX512-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 8
; AVX512-NEXT: [[TMP39:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000
-; AVX512-NEXT: br i1 [[TMP39]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; AVX512-NEXT: br i1 [[TMP39]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; AVX512: vec.epilog.middle.block:
; AVX512-NEXT: br i1 true, label [[FOR_END]], label [[SCALAR_PH]]
; AVX512: vec.epilog.scalar.ph:
@@ -1222,7 +1165,7 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
-; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP29:![0-9]+]]
+; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP21:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
@@ -1312,21 +1255,21 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <8 x i64> [[VEC_IND]]
-; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP0]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !alias.scope [[META30:![0-9]+]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP0]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !alias.scope [[META22:![0-9]+]]
; AVX512-NEXT: [[TMP1:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER]], splat (i32 100)
; AVX512-NEXT: [[TMP2:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND]], splat (i64 1)
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[B]], <8 x i64> [[TMP2]]
-; AVX512-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP3]], i32 8, <8 x i1> [[TMP1]], <8 x double> poison), !alias.scope [[META33:![0-9]+]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP3]], i32 8, <8 x i1> [[TMP1]], <8 x double> poison), !alias.scope [[META25:![0-9]+]]
; AVX512-NEXT: [[TMP4:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER]] to <8 x double>
; AVX512-NEXT: [[TMP5:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER6]], [[TMP4]]
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[A]], <8 x i64> [[VEC_IND]]
-; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> [[TMP5]], <8 x ptr> [[TMP6]], i32 8, <8 x i1> [[TMP1]]), !alias.scope [[META35:![0-9]+]], !noalias [[META37:![0-9]+]]
+; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> [[TMP5]], <8 x ptr> [[TMP6]], i32 8, <8 x i1> [[TMP1]]), !alias.scope [[META27:![0-9]+]], !noalias [[META29:![0-9]+]]
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; AVX512-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 128)
; AVX512-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 624
-; AVX512-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; AVX512-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
; AVX512: middle.block:
-; AVX512-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; AVX512-NEXT: br label [[SCALAR_PH]]
; AVX512: scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
@@ -1348,7 +1291,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; AVX512-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
-; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP39:![0-9]+]]
+; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP31:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
@@ -1505,9 +1448,9 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr
; AVX2-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; AVX2-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
; AVX2: middle.block:
-; AVX2-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; AVX2-NEXT: br label [[FOR_END:%.*]]
; AVX2: scalar.ph:
-; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 4095, [[ENTRY:%.*]] ], [ 4095, [[VECTOR_MEMCHECK]] ]
+; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4095, [[ENTRY:%.*]] ], [ 4095, [[VECTOR_MEMCHECK]] ]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
@@ -1558,13 +1501,13 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 -7
; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 -24
; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 -7
-; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4, !alias.scope [[META40:![0-9]+]]
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4, !alias.scope [[META32:![0-9]+]]
; AVX512-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META40]]
+; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META32]]
; AVX512-NEXT: [[REVERSE7:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD6]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4, !alias.scope [[META40]]
+; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4, !alias.scope [[META32]]
; AVX512-NEXT: [[REVERSE9:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD8]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4, !alias.scope [[META40]]
+; AVX512-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4, !alias.scope [[META32]]
; AVX512-NEXT: [[REVERSE11:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD10]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP10:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer
; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <8 x i32> [[REVERSE7]], zeroinitializer
@@ -1580,16 +1523,16 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[TMP14]], i32 -24
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[TMP21]], i32 -7
; AVX512-NEXT: [[REVERSE12:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP16]], i32 8, <8 x i1> [[REVERSE12]], <8 x double> poison), !alias.scope [[META43:![0-9]+]]
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP16]], i32 8, <8 x i1> [[REVERSE12]], <8 x double> poison), !alias.scope [[META35:![0-9]+]]
; AVX512-NEXT: [[REVERSE13:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[REVERSE14:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP18]], i32 8, <8 x i1> [[REVERSE14]], <8 x double> poison), !alias.scope [[META43]]
+; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP18]], i32 8, <8 x i1> [[REVERSE14]], <8 x double> poison), !alias.scope [[META35]]
; AVX512-NEXT: [[REVERSE16:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD15]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[REVERSE17:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP20]], i32 8, <8 x i1> [[REVERSE17]], <8 x double> poison), !alias.scope [[META43]]
+; AVX512-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP20]], i32 8, <8 x i1> [[REVERSE17]], <8 x double> poison), !alias.scope [[META35]]
; AVX512-NEXT: [[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD18]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[REVERSE20:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP22]], i32 8, <8 x i1> [[REVERSE20]], <8 x double> poison), !alias.scope [[META43]]
+; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP22]], i32 8, <8 x i1> [[REVERSE20]], <8 x double> poison), !alias.scope [[META35]]
; AVX512-NEXT: [[REVERSE22:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD21]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP23:%.*]] = fadd <8 x double> [[REVERSE13]], splat (double 5.000000e-01)
; AVX512-NEXT: [[TMP24:%.*]] = fadd <8 x double> [[REVERSE16]], splat (double 5.000000e-01)
@@ -1605,20 +1548,20 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[TMP27]], i32 -24
; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[TMP34]], i32 -7
; AVX512-NEXT: [[REVERSE24:%.*]] = shufflevector <8 x double> [[TMP23]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE24]], ptr [[TMP29]], i32 8, <8 x i1> [[REVERSE12]]), !alias.scope [[META45:![0-9]+]], !noalias [[META47:![0-9]+]]
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE24]], ptr [[TMP29]], i32 8, <8 x i1> [[REVERSE12]]), !alias.scope [[META37:![0-9]+]], !noalias [[META39:![0-9]+]]
; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x double> [[TMP24]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE26]], ptr [[TMP31]], i32 8, <8 x i1> [[REVERSE14]]), !alias.scope [[META45]], !noalias [[META47]]
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE26]], ptr [[TMP31]], i32 8, <8 x i1> [[REVERSE14]]), !alias.scope [[META37]], !noalias [[META39]]
; AVX512-NEXT: [[REVERSE28:%.*]] = shufflevector <8 x double> [[TMP25]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE28]], ptr [[TMP33]], i32 8, <8 x i1> [[REVERSE17]]), !alias.scope [[META45]], !noalias [[META47]]
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE28]], ptr [[TMP33]], i32 8, <8 x i1> [[REVERSE17]]), !alias.scope [[META37]], !noalias [[META39]]
; AVX512-NEXT: [[REVERSE30:%.*]] = shufflevector <8 x double> [[TMP26]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE30]], ptr [[TMP35]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope [[META45]], !noalias [[META47]]
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE30]], ptr [[TMP35]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope [[META37]], !noalias [[META39]]
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; AVX512-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
+; AVX512-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
; AVX512: middle.block:
-; AVX512-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; AVX512-NEXT: br label [[FOR_END:%.*]]
; AVX512: scalar.ph:
-; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 4095, [[ENTRY:%.*]] ], [ 4095, [[VECTOR_MEMCHECK]] ]
+; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4095, [[ENTRY:%.*]] ], [ 4095, [[VECTOR_MEMCHECK]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
@@ -1636,7 +1579,7 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
; AVX512-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
-; AVX512-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP49:![0-9]+]]
+; AVX512-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
@@ -1694,11 +1637,10 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[INDEX]]
-; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4
; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 8
; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 12
-; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
; AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
; AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
@@ -1711,11 +1653,10 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX1-NEXT: [[TMP16:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer
; AVX1-NEXT: [[TMP17:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer
; AVX1-NEXT: [[TMP13:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[INDEX]]
-; AVX1-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP13]], i32 0
; AVX1-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP13]], i32 4
; AVX1-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[TMP13]], i32 8
; AVX1-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[TMP13]], i32 12
-; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP19]], i32 8, <4 x i1> [[TMP14]], <4 x ptr> poison)
+; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP13]], i32 8, <4 x i1> [[TMP14]], <4 x ptr> poison)
; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP15]], <4 x ptr> poison)
; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP16]], <4 x ptr> poison)
; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP17]], <4 x ptr> poison)
@@ -1728,11 +1669,10 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX1-NEXT: [[TMP33:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> [[TMP29]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP34:%.*]] = select <4 x i1> [[TMP17]], <4 x i1> [[TMP30]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[INDEX]]
-; AVX1-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP35]], i32 0
; AVX1-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP35]], i32 4
; AVX1-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP35]], i32 8
; AVX1-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP35]], i32 12
-; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP36]], i32 8, <4 x i1> [[TMP31]])
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP35]], i32 8, <4 x i1> [[TMP31]])
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP37]], i32 8, <4 x i1> [[TMP32]])
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP38]], i32 8, <4 x i1> [[TMP33]])
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <4 x i1> [[TMP34]])
@@ -1754,18 +1694,15 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX1: vec.epilog.vector.body:
; AVX1-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[FOR_BODY]] ]
; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDEX10]]
-; AVX1-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[TMP56]], i32 0
-; AVX1-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP57]], align 1
+; AVX1-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP56]], align 1
; AVX1-NEXT: [[TMP44:%.*]] = and <4 x i8> [[WIDE_LOAD11]], splat (i8 1)
; AVX1-NEXT: [[TMP46:%.*]] = icmp ne <4 x i8> [[TMP44]], zeroinitializer
; AVX1-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[INDEX10]]
-; AVX1-NEXT: [[TMP48:%.*]] = getelementptr ptr, ptr [[TMP47]], i32 0
-; AVX1-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP48]], i32 8, <4 x i1> [[TMP46]], <4 x ptr> poison)
+; AVX1-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP47]], i32 8, <4 x i1> [[TMP46]], <4 x ptr> poison)
; AVX1-NEXT: [[TMP50:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer
; AVX1-NEXT: [[TMP51:%.*]] = select <4 x i1> [[TMP46]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[INDEX10]]
-; AVX1-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[TMP52]], i32 0
-; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP53]], i32 8, <4 x i1> [[TMP51]])
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP52]], i32 8, <4 x i1> [[TMP51]])
; AVX1-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4
; AVX1-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]]
; AVX1-NEXT: br i1 [[TMP54]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
@@ -1818,11 +1755,10 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[INDEX]]
-; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 8
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 12
-; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
; AVX2-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; AVX2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
; AVX2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
@@ -1835,11 +1771,10 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX2-NEXT: [[TMP16:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer
; AVX2-NEXT: [[TMP17:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer
; AVX2-NEXT: [[TMP18:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[INDEX]]
-; AVX2-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 0
; AVX2-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 4
; AVX2-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 8
; AVX2-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 12
-; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP19]], i32 8, <4 x i1> [[TMP14]], <4 x ptr> poison)
+; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP18]], i32 8, <4 x i1> [[TMP14]], <4 x ptr> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP15]], <4 x ptr> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP16]], <4 x ptr> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP17]], <4 x ptr> poison)
@@ -1852,11 +1787,10 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX2-NEXT: [[TMP33:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> [[TMP29]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP34:%.*]] = select <4 x i1> [[TMP17]], <4 x i1> [[TMP30]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[INDEX]]
-; AVX2-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP35]], i32 0
; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP35]], i32 4
; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP35]], i32 8
; AVX2-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP35]], i32 12
-; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP36]], i32 8, <4 x i1> [[TMP31]])
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP35]], i32 8, <4 x i1> [[TMP31]])
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP37]], i32 8, <4 x i1> [[TMP32]])
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP38]], i32 8, <4 x i1> [[TMP33]])
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <4 x i1> [[TMP34]])
@@ -1878,18 +1812,15 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX2: vec.epilog.vector.body:
; AVX2-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[FOR_BODY]] ]
; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDEX10]]
-; AVX2-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[TMP56]], i32 0
-; AVX2-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP57]], align 1
+; AVX2-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP56]], align 1
; AVX2-NEXT: [[TMP44:%.*]] = and <4 x i8> [[WIDE_LOAD11]], splat (i8 1)
; AVX2-NEXT: [[TMP46:%.*]] = icmp ne <4 x i8> [[TMP44]], zeroinitializer
; AVX2-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[INDEX10]]
-; AVX2-NEXT: [[TMP48:%.*]] = getelementptr ptr, ptr [[TMP47]], i32 0
-; AVX2-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP48]], i32 8, <4 x i1> [[TMP46]], <4 x ptr> poison)
+; AVX2-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP47]], i32 8, <4 x i1> [[TMP46]], <4 x ptr> poison)
; AVX2-NEXT: [[TMP50:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer
; AVX2-NEXT: [[TMP51:%.*]] = select <4 x i1> [[TMP46]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[INDEX10]]
-; AVX2-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[TMP52]], i32 0
-; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP53]], i32 8, <4 x i1> [[TMP51]])
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP52]], i32 8, <4 x i1> [[TMP51]])
; AVX2-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4
; AVX2-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]]
; AVX2-NEXT: br i1 [[TMP54]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
@@ -1942,11 +1873,10 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[INDEX]]
-; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 8
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 24
-; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
; AVX512-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1
; AVX512-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP4]], align 1
; AVX512-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
@@ -1959,11 +1889,10 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX512-NEXT: [[TMP16:%.*]] = icmp ne <8 x i8> [[TMP8]], zeroinitializer
; AVX512-NEXT: [[TMP17:%.*]] = icmp ne <8 x i8> [[TMP9]], zeroinitializer
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[INDEX]]
-; AVX512-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 0
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 8
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 16
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 24
-; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP19]], i32 8, <8 x i1> [[TMP14]], <8 x ptr> poison)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP18]], i32 8, <8 x i1> [[TMP14]], <8 x ptr> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP20]], i32 8, <8 x i1> [[TMP15]], <8 x ptr> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP21]], i32 8, <8 x i1> [[TMP16]], <8 x ptr> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP22]], i32 8, <8 x i1> [[TMP17]], <8 x ptr> poison)
@@ -1976,17 +1905,16 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX512-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP16]], <8 x i1> [[TMP29]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP34:%.*]] = select <8 x i1> [[TMP17]], <8 x i1> [[TMP30]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[INDEX]]
-; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP35]], i32 0
; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP35]], i32 8
; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP35]], i32 16
; AVX512-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP35]], i32 24
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP36]], i32 8, <8 x i1> [[TMP31]])
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP35]], i32 8, <8 x i1> [[TMP31]])
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP37]], i32 8, <8 x i1> [[TMP32]])
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP38]], i32 8, <8 x i1> [[TMP33]])
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <8 x i1> [[TMP34]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
+; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -2002,21 +1930,18 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX512: vec.epilog.vector.body:
; AVX512-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[FOR_BODY]] ]
; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDEX10]]
-; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[TMP56]], i32 0
-; AVX512-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, ptr [[TMP57]], align 1
+; AVX512-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, ptr [[TMP56]], align 1
; AVX512-NEXT: [[TMP44:%.*]] = and <8 x i8> [[WIDE_LOAD11]], splat (i8 1)
; AVX512-NEXT: [[TMP46:%.*]] = icmp ne <8 x i8> [[TMP44]], zeroinitializer
; AVX512-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[INDEX10]]
-; AVX512-NEXT: [[TMP48:%.*]] = getelementptr ptr, ptr [[TMP47]], i32 0
-; AVX512-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP48]], i32 8, <8 x i1> [[TMP46]], <8 x ptr> poison)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP47]], i32 8, <8 x i1> [[TMP46]], <8 x ptr> poison)
; AVX512-NEXT: [[TMP50:%.*]] = icmp ne <8 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer
; AVX512-NEXT: [[TMP51:%.*]] = select <8 x i1> [[TMP46]], <8 x i1> [[TMP50]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[INDEX10]]
-; AVX512-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[TMP52]], i32 0
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP53]], i32 8, <8 x i1> [[TMP51]])
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP52]], i32 8, <8 x i1> [[TMP51]])
; AVX512-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 8
; AVX512-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]]
-; AVX512-NEXT: br i1 [[TMP54]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP51:![0-9]+]]
+; AVX512-NEXT: br i1 [[TMP54]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
; AVX512: vec.epilog.middle.block:
; AVX512-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]]
; AVX512-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -2042,7 +1967,7 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP52:![0-9]+]]
+; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP44:![0-9]+]]
; AVX512: for.end.loopexit:
; AVX512-NEXT: br label [[FOR_END]]
; AVX512: for.end:
@@ -2111,11 +2036,10 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[INDEX]]
-; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4
; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 8
; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 12
-; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
; AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
; AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
@@ -2128,11 +2052,10 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX1-NEXT: [[TMP16:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer
; AVX1-NEXT: [[TMP17:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer
; AVX1-NEXT: [[TMP18:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[INDEX]]
-; AVX1-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 0
; AVX1-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 4
; AVX1-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 8
; AVX1-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 12
-; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP19]], i32 8, <4 x i1> [[TMP14]], <4 x ptr> poison)
+; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP18]], i32 8, <4 x i1> [[TMP14]], <4 x ptr> poison)
; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP15]], <4 x ptr> poison)
; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP16]], <4 x ptr> poison)
; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP17]], <4 x ptr> poison)
@@ -2145,11 +2068,10 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX1-NEXT: [[TMP33:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> [[TMP29]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP34:%.*]] = select <4 x i1> [[TMP17]], <4 x i1> [[TMP30]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[INDEX]]
-; AVX1-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP35]], i32 0
; AVX1-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP35]], i32 4
; AVX1-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP35]], i32 8
; AVX1-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP35]], i32 12
-; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP36]], i32 8, <4 x i1> [[TMP31]])
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP35]], i32 8, <4 x i1> [[TMP31]])
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP37]], i32 8, <4 x i1> [[TMP32]])
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP38]], i32 8, <4 x i1> [[TMP33]])
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <4 x i1> [[TMP34]])
@@ -2171,18 +2093,15 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX1: vec.epilog.vector.body:
; AVX1-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[FOR_BODY]] ]
; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDEX10]]
-; AVX1-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[TMP56]], i32 0
-; AVX1-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP57]], align 1
+; AVX1-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP56]], align 1
; AVX1-NEXT: [[TMP44:%.*]] = and <4 x i8> [[WIDE_LOAD11]], splat (i8 1)
; AVX1-NEXT: [[TMP46:%.*]] = icmp ne <4 x i8> [[TMP44]], zeroinitializer
; AVX1-NEXT: [[TMP45:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[INDEX10]]
-; AVX1-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[TMP45]], i32 0
-; AVX1-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP47]], i32 8, <4 x i1> [[TMP46]], <4 x ptr> poison)
+; AVX1-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP45]], i32 8, <4 x i1> [[TMP46]], <4 x ptr> poison)
; AVX1-NEXT: [[TMP50:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer
; AVX1-NEXT: [[TMP51:%.*]] = select <4 x i1> [[TMP46]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[INDEX10]]
-; AVX1-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[TMP52]], i32 0
-; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP53]], i32 8, <4 x i1> [[TMP51]])
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP52]], i32 8, <4 x i1> [[TMP51]])
; AVX1-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4
; AVX1-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]]
; AVX1-NEXT: br i1 [[TMP54]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
@@ -2235,11 +2154,10 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[INDEX]]
-; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 8
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 12
-; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
; AVX2-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; AVX2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
; AVX2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
@@ -2252,11 +2170,10 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX2-NEXT: [[TMP16:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer
; AVX2-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer
; AVX2-NEXT: [[TMP13:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[INDEX]]
-; AVX2-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[TMP13]], i32 0
; AVX2-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP13]], i32 4
; AVX2-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP13]], i32 8
; AVX2-NEXT: [[TMP23:%.*]] = getelementptr ptr, ptr [[TMP13]], i32 12
-; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP17]], <4 x ptr> poison)
+; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP13]], i32 8, <4 x i1> [[TMP17]], <4 x ptr> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP19]], i32 8, <4 x i1> [[TMP15]], <4 x ptr> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP16]], <4 x ptr> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP23]], i32 8, <4 x i1> [[TMP12]], <4 x ptr> poison)
@@ -2269,11 +2186,10 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX2-NEXT: [[TMP33:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> [[TMP29]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP34:%.*]] = select <4 x i1> [[TMP12]], <4 x i1> [[TMP21]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[INDEX]]
-; AVX2-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP35]], i32 0
; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP35]], i32 4
; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP35]], i32 8
; AVX2-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP35]], i32 12
-; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP36]], i32 8, <4 x i1> [[TMP31]])
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP35]], i32 8, <4 x i1> [[TMP31]])
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP37]], i32 8, <4 x i1> [[TMP32]])
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP38]], i32 8, <4 x i1> [[TMP33]])
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <4 x i1> [[TMP34]])
@@ -2295,18 +2211,15 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX2: vec.epilog.vector.body:
; AVX2-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[FOR_BODY]] ]
; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDEX10]]
-; AVX2-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[TMP56]], i32 0
-; AVX2-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP57]], align 1
+; AVX2-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP56]], align 1
; AVX2-NEXT: [[TMP44:%.*]] = and <4 x i8> [[WIDE_LOAD11]], splat (i8 1)
; AVX2-NEXT: [[TMP46:%.*]] = icmp ne <4 x i8> [[TMP44]], zeroinitializer
; AVX2-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[INDEX10]]
-; AVX2-NEXT: [[TMP48:%.*]] = getelementptr ptr, ptr [[TMP47]], i32 0
-; AVX2-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP48]], i32 8, <4 x i1> [[TMP46]], <4 x ptr> poison)
+; AVX2-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP47]], i32 8, <4 x i1> [[TMP46]], <4 x ptr> poison)
; AVX2-NEXT: [[TMP50:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer
; AVX2-NEXT: [[TMP51:%.*]] = select <4 x i1> [[TMP46]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[INDEX10]]
-; AVX2-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[TMP52]], i32 0
-; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP53]], i32 8, <4 x i1> [[TMP51]])
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP52]], i32 8, <4 x i1> [[TMP51]])
; AVX2-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4
; AVX2-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]]
; AVX2-NEXT: br i1 [[TMP54]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
@@ -2359,11 +2272,10 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[INDEX]]
-; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 8
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 24
-; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
; AVX512-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1
; AVX512-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP4]], align 1
; AVX512-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
@@ -2376,11 +2288,10 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX512-NEXT: [[TMP16:%.*]] = icmp ne <8 x i8> [[TMP8]], zeroinitializer
; AVX512-NEXT: [[TMP17:%.*]] = icmp ne <8 x i8> [[TMP9]], zeroinitializer
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[INDEX]]
-; AVX512-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 0
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 8
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 16
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 24
-; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP19]], i32 8, <8 x i1> [[TMP14]], <8 x ptr> poison)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP18]], i32 8, <8 x i1> [[TMP14]], <8 x ptr> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP20]], i32 8, <8 x i1> [[TMP15]], <8 x ptr> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP21]], i32 8, <8 x i1> [[TMP16]], <8 x ptr> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP22]], i32 8, <8 x i1> [[TMP17]], <8 x ptr> poison)
@@ -2393,17 +2304,16 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX512-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP16]], <8 x i1> [[TMP29]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP34:%.*]] = select <8 x i1> [[TMP17]], <8 x i1> [[TMP30]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[INDEX]]
-; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP35]], i32 0
; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP35]], i32 8
; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP35]], i32 16
; AVX512-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP35]], i32 24
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP36]], i32 8, <8 x i1> [[TMP31]])
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP35]], i32 8, <8 x i1> [[TMP31]])
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP37]], i32 8, <8 x i1> [[TMP32]])
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP38]], i32 8, <8 x i1> [[TMP33]])
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <8 x i1> [[TMP34]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP53:![0-9]+]]
+; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -2419,21 +2329,18 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX512: vec.epilog.vector.body:
; AVX512-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[FOR_BODY]] ]
; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDEX10]]
-; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[TMP56]], i32 0
-; AVX512-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, ptr [[TMP57]], align 1
+; AVX512-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, ptr [[TMP56]], align 1
; AVX512-NEXT: [[TMP44:%.*]] = and <8 x i8> [[WIDE_LOAD11]], splat (i8 1)
; AVX512-NEXT: [[TMP46:%.*]] = icmp ne <8 x i8> [[TMP44]], zeroinitializer
; AVX512-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[INDEX10]]
-; AVX512-NEXT: [[TMP48:%.*]] = getelementptr ptr, ptr [[TMP47]], i32 0
-; AVX512-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP48]], i32 8, <8 x i1> [[TMP46]], <8 x ptr> poison)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP47]], i32 8, <8 x i1> [[TMP46]], <8 x ptr> poison)
; AVX512-NEXT: [[TMP50:%.*]] = icmp ne <8 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer
; AVX512-NEXT: [[TMP51:%.*]] = select <8 x i1> [[TMP46]], <8 x i1> [[TMP50]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[INDEX10]]
-; AVX512-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[TMP52]], i32 0
-; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP53]], i32 8, <8 x i1> [[TMP51]])
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP52]], i32 8, <8 x i1> [[TMP51]])
; AVX512-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 8
; AVX512-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]]
-; AVX512-NEXT: br i1 [[TMP54]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]]
+; AVX512-NEXT: br i1 [[TMP54]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]]
; AVX512: vec.epilog.middle.block:
; AVX512-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]]
; AVX512-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -2459,7 +2366,7 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP55:![0-9]+]]
+; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP47:![0-9]+]]
; AVX512: for.end.loopexit:
; AVX512-NEXT: br label [[FOR_END]]
; AVX512: for.end:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
index c1d3412..f26064a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
@@ -1186,24 +1186,22 @@ define i32 @nopragma(ptr noalias nocapture %a, ptr noalias nocapture readonly %b
; O1VEC2: vector.body:
; O1VEC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; O1VEC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i32, ptr [[B:%.*]], i64 [[INDEX]]
-; O1VEC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0
; O1VEC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 4
-; O1VEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; O1VEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; O1VEC2-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
; O1VEC2-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; O1VEC2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]]
; O1VEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A:%.*]], i64 [[INDEX]]
-; O1VEC2-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP6]], i32 0
; O1VEC2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP6]], i32 4
-; O1VEC2-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP7]], align 4
+; O1VEC2-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP6]], align 4
; O1VEC2-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP8]], align 4
; O1VEC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; O1VEC2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; O1VEC2-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; O1VEC2: middle.block:
-; O1VEC2-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; O1VEC2-NEXT: br label [[FOR_END:%.*]]
; O1VEC2: scalar.ph:
-; O1VEC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; O1VEC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; O1VEC2-NEXT: br label [[FOR_BODY:%.*]]
; O1VEC2: for.body:
; O1VEC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -1229,24 +1227,22 @@ define i32 @nopragma(ptr noalias nocapture %a, ptr noalias nocapture readonly %b
; OzVEC2: vector.body:
; OzVEC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; OzVEC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i32, ptr [[B:%.*]], i64 [[INDEX]]
-; OzVEC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0
; OzVEC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 4
-; OzVEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; OzVEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; OzVEC2-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
; OzVEC2-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; OzVEC2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]]
; OzVEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A:%.*]], i64 [[INDEX]]
-; OzVEC2-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP6]], i32 0
; OzVEC2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP6]], i32 4
-; OzVEC2-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP7]], align 4
+; OzVEC2-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP6]], align 4
; OzVEC2-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP8]], align 4
; OzVEC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; OzVEC2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; OzVEC2-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; OzVEC2: middle.block:
-; OzVEC2-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; OzVEC2-NEXT: br label [[FOR_END:%.*]]
; OzVEC2: scalar.ph:
-; OzVEC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; OzVEC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; OzVEC2-NEXT: br label [[FOR_BODY:%.*]]
; OzVEC2: for.body:
; OzVEC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
index 074aeb8..07e2df3 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
@@ -22,11 +22,10 @@ define i32 @foo_optsize() #0 {
; CHECK-NEXT: [[VEC_IV:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <64 x i32> [[VEC_IV]], splat (i32 202)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr [[TMP3]], i32 1, <64 x i1> [[TMP1]], <64 x i8> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr [[TMP2]], i32 1, <64 x i1> [[TMP1]], <64 x i8> poison)
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = select <64 x i1> [[TMP4]], <64 x i8> splat (i8 2), <64 x i8> splat (i8 1)
-; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0(<64 x i8> [[TMP5]], ptr [[TMP3]], i32 1, <64 x i1> [[TMP1]])
+; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0(<64 x i8> [[TMP5]], ptr [[TMP2]], i32 1, <64 x i1> [[TMP1]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 64
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -60,11 +59,10 @@ define i32 @foo_optsize() #0 {
; AUTOVF-NEXT: [[VEC_IV:%.*]] = add <32 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; AUTOVF-NEXT: [[TMP1:%.*]] = icmp ule <32 x i32> [[VEC_IV]], splat (i32 202)
; AUTOVF-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[INDEX]]
-; AUTOVF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr [[TMP3]], i32 1, <32 x i1> [[TMP1]], <32 x i8> poison)
+; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr [[TMP2]], i32 1, <32 x i1> [[TMP1]], <32 x i8> poison)
; AUTOVF-NEXT: [[TMP4:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
; AUTOVF-NEXT: [[TMP5:%.*]] = select <32 x i1> [[TMP4]], <32 x i8> splat (i8 2), <32 x i8> splat (i8 1)
-; AUTOVF-NEXT: call void @llvm.masked.store.v32i8.p0(<32 x i8> [[TMP5]], ptr [[TMP3]], i32 1, <32 x i1> [[TMP1]])
+; AUTOVF-NEXT: call void @llvm.masked.store.v32i8.p0(<32 x i8> [[TMP5]], ptr [[TMP2]], i32 1, <32 x i1> [[TMP1]])
; AUTOVF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
; AUTOVF-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224
; AUTOVF-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -120,11 +118,10 @@ define i32 @foo_minsize() #1 {
; CHECK-NEXT: [[VEC_IV:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <64 x i32> [[VEC_IV]], splat (i32 202)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr [[TMP3]], i32 1, <64 x i1> [[TMP1]], <64 x i8> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr [[TMP2]], i32 1, <64 x i1> [[TMP1]], <64 x i8> poison)
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = select <64 x i1> [[TMP4]], <64 x i8> splat (i8 2), <64 x i8> splat (i8 1)
-; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0(<64 x i8> [[TMP5]], ptr [[TMP3]], i32 1, <64 x i1> [[TMP1]])
+; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0(<64 x i8> [[TMP5]], ptr [[TMP2]], i32 1, <64 x i1> [[TMP1]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 64
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -158,11 +155,10 @@ define i32 @foo_minsize() #1 {
; AUTOVF-NEXT: [[VEC_IV:%.*]] = add <32 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; AUTOVF-NEXT: [[TMP1:%.*]] = icmp ule <32 x i32> [[VEC_IV]], splat (i32 202)
; AUTOVF-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[INDEX]]
-; AUTOVF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr [[TMP3]], i32 1, <32 x i1> [[TMP1]], <32 x i8> poison)
+; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr [[TMP2]], i32 1, <32 x i1> [[TMP1]], <32 x i8> poison)
; AUTOVF-NEXT: [[TMP4:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
; AUTOVF-NEXT: [[TMP5:%.*]] = select <32 x i1> [[TMP4]], <32 x i8> splat (i8 2), <32 x i8> splat (i8 1)
-; AUTOVF-NEXT: call void @llvm.masked.store.v32i8.p0(<32 x i8> [[TMP5]], ptr [[TMP3]], i32 1, <32 x i1> [[TMP1]])
+; AUTOVF-NEXT: call void @llvm.masked.store.v32i8.p0(<32 x i8> [[TMP5]], ptr [[TMP2]], i32 1, <32 x i1> [[TMP1]])
; AUTOVF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
; AUTOVF-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224
; AUTOVF-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -222,16 +218,15 @@ define void @scev4stride1(ptr noalias nocapture %a, ptr noalias nocapture readon
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], <64 x i32> [[TMP1]]
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <64 x i32> @llvm.masked.gather.v64i32.v64p0(<64 x ptr> [[TMP2]], i32 4, <64 x i1> splat (i1 true), <64 x i32> poison)
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
-; CHECK-NEXT: store <64 x i32> [[WIDE_MASKED_GATHER]], ptr [[TMP4]], align 4
+; CHECK-NEXT: store <64 x i32> [[WIDE_MASKED_GATHER]], ptr [[TMP3]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 64
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <64 x i32> [[VEC_IND]], splat (i32 64)
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END_LOOPEXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -260,16 +255,15 @@ define void @scev4stride1(ptr noalias nocapture %a, ptr noalias nocapture readon
; AUTOVF-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], <8 x i32> [[TMP1]]
; AUTOVF-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison)
; AUTOVF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]]
-; AUTOVF-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
-; AUTOVF-NEXT: store <8 x i32> [[WIDE_MASKED_GATHER]], ptr [[TMP4]], align 4
+; AUTOVF-NEXT: store <8 x i32> [[WIDE_MASKED_GATHER]], ptr [[TMP3]], align 4
; AUTOVF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; AUTOVF-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8)
; AUTOVF-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; AUTOVF-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; AUTOVF: middle.block:
-; AUTOVF-NEXT: br i1 true, label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; AUTOVF-NEXT: br label [[FOR_END_LOOPEXIT:%.*]]
; AUTOVF: scalar.ph:
-; AUTOVF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ]
+; AUTOVF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER:%.*]] ]
; AUTOVF-NEXT: br label [[FOR_BODY:%.*]]
; AUTOVF: for.body:
; AUTOVF-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll
index f1cbb4a..01d11cc 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll
@@ -108,7 +108,7 @@ define void @parallel_loop(ptr nocapture %a, ptr nocapture %b) nounwind uwtable
; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll b/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll
index 0c99ff6..0cbfb34 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll
@@ -46,36 +46,23 @@ define i32 @unused_blend_after_unrolling(ptr %p, i32 %a, i1 %c.1, i16 %x, i16 %y
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
; CHECK-NEXT: br i1 [[TMP6]], label %[[PRED_SDIV_IF10:.*]], label %[[PRED_SDIV_CONTINUE11:.*]]
; CHECK: [[PRED_SDIV_IF10]]:
-; CHECK-NEXT: [[TMP7:%.*]] = sdiv i16 [[X]], [[Y]]
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i16> poison, i16 [[TMP7]], i32 0
; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE11]]
; CHECK: [[PRED_SDIV_CONTINUE11]]:
-; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i16> [ poison, %[[PRED_SDIV_CONTINUE9]] ], [ [[TMP8]], %[[PRED_SDIV_IF10]] ]
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_SDIV_IF12:.*]], label %[[PRED_SDIV_CONTINUE13:.*]]
; CHECK: [[PRED_SDIV_IF12]]:
-; CHECK-NEXT: [[TMP11:%.*]] = sdiv i16 [[X]], [[Y]]
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i16> [[TMP9]], i16 [[TMP11]], i32 1
; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE13]]
; CHECK: [[PRED_SDIV_CONTINUE13]]:
-; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i16> [ [[TMP9]], %[[PRED_SDIV_CONTINUE11]] ], [ [[TMP12]], %[[PRED_SDIV_IF12]] ]
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_SDIV_IF14:.*]], label %[[PRED_SDIV_CONTINUE15:.*]]
; CHECK: [[PRED_SDIV_IF14]]:
-; CHECK-NEXT: [[TMP15:%.*]] = sdiv i16 [[X]], [[Y]]
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i16> [[TMP13]], i16 [[TMP15]], i32 2
; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE15]]
; CHECK: [[PRED_SDIV_CONTINUE15]]:
-; CHECK-NEXT: [[TMP17:%.*]] = phi <4 x i16> [ [[TMP13]], %[[PRED_SDIV_CONTINUE13]] ], [ [[TMP16]], %[[PRED_SDIV_IF14]] ]
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
; CHECK-NEXT: br i1 [[TMP18]], label %[[PRED_SDIV_IF16:.*]], label %[[PRED_SDIV_CONTINUE17]]
; CHECK: [[PRED_SDIV_IF16]]:
-; CHECK-NEXT: [[TMP19:%.*]] = sdiv i16 [[X]], [[Y]]
-; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[TMP19]], i32 3
; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE17]]
; CHECK: [[PRED_SDIV_CONTINUE17]]:
-; CHECK-NEXT: [[TMP21:%.*]] = phi <4 x i16> [ [[TMP17]], %[[PRED_SDIV_CONTINUE15]] ], [ [[TMP20]], %[[PRED_SDIV_IF16]] ]
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i16> zeroinitializer, <4 x i16> [[TMP21]]
; CHECK-NEXT: [[TMP24]] = or <4 x i1> [[VEC_PHI]], [[TMP22]]
; CHECK-NEXT: [[TMP25]] = or <4 x i1> [[VEC_PHI3]], [[TMP22]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
@@ -86,8 +73,7 @@ define i32 @unused_blend_after_unrolling(ptr %p, i32 %a, i1 %c.1, i16 %x, i16 %y
; CHECK-NEXT: [[TMP27:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX]])
; CHECK-NEXT: [[TMP28:%.*]] = freeze i1 [[TMP27]]
; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP28]], i32 0, i32 0
-; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i16> [[PREDPHI]], i32 3
-; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 97, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
@@ -104,10 +90,10 @@ define i32 @unused_blend_after_unrolling(ptr %p, i32 %a, i1 %c.1, i16 %x, i16 %y
; CHECK-NEXT: [[SEL]] = select i1 [[C]], i32 [[B]], i32 0
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 100
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[BLEND_LCSSA:%.*]] = phi i16 [ [[BLEND]], %[[LOOP_LATCH]] ], [ [[TMP29]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP_LATCH]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BLEND_LCSSA:%.*]] = phi i16 [ [[BLEND]], %[[LOOP_LATCH]] ]
+; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP_LATCH]] ]
; CHECK-NEXT: store i16 [[BLEND_LCSSA]], ptr [[P]], align 2
; CHECK-NEXT: ret i32 [[SEL_LCSSA]]
;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr131359-dead-for-splice.ll b/llvm/test/Transforms/LoopVectorize/X86/pr131359-dead-for-splice.ll
index c02ec91..18f585b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr131359-dead-for-splice.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr131359-dead-for-splice.ll
@@ -23,7 +23,7 @@ define void @no_use() {
; CHECK-NEXT: br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 3
-; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 40, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
@@ -33,7 +33,7 @@ define void @no_use() {
; CHECK-NEXT: [[E_0_I]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC_I:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[INC_I]] = add i32 [[E_0_I]], 1
; CHECK-NEXT: [[EXITCOND_NOT_I:%.*]] = icmp eq i32 [[E_0_I]], 43
-; CHECK-NEXT: br i1 [[EXITCOND_NOT_I]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -67,7 +67,7 @@ define void @dead_use() {
; CHECK-NEXT: br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 3
-; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 40, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
@@ -78,7 +78,7 @@ define void @dead_use() {
; CHECK-NEXT: [[DEAD:%.*]] = add i32 [[D_0_I]], 1
; CHECK-NEXT: [[INC_I]] = add i32 [[E_0_I]], 1
; CHECK-NEXT: [[EXITCOND_NOT_I:%.*]] = icmp eq i32 [[E_0_I]], 43
-; CHECK-NEXT: br i1 [[EXITCOND_NOT_I]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll b/llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll
index 9a70ed4..c1d08e1 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll
@@ -105,9 +105,9 @@ define i8 @pr141968(i1 %cond, i8 %v) {
; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i8> [[PREDPHI]], i32 15
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
index 753ef69..df2e35d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
@@ -16,18 +16,15 @@ define void @small_tc(ptr noalias nocapture %A, ptr noalias nocapture readonly %
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2:%.*]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
-; CHECK-NEXT: store <8 x float> [[TMP4]], ptr [[TMP5]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: store <8 x float> [[TMP4]], ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
index 59317fa..639fb86 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
@@ -71,9 +71,8 @@ define i32 @main(ptr %ptr) {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[DOTPROMOTED]], [[INDEX]]
; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[TMP20]]
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 0
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 4
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP24]], align 4
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP22]], align 4
; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP25]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll b/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll
index 52832ed..aad7869 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll
@@ -27,17 +27,16 @@ define void @foo(ptr %ptr, ptr %ptr.2) {
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 3
; CHECK-NEXT: store i32 [[TMP4]], ptr [[PTR_2]], align 4, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <4 x i64> [[VEC_IND]], ptr [[TMP7]], align 8, !alias.scope [[META3]]
+; CHECK-NEXT: store <4 x i64> [[VEC_IND]], ptr [[TMP6]], align 8, !alias.scope [[META3]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 80
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 80, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ 82, [[MIDDLE_BLOCK]] ], [ 2, [[ENTRY]] ], [ 2, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ 2, [[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[CAN_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[CAN_IV_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
index 71e000a..d2f8f22 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
@@ -37,8 +37,7 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon
; SSE2-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[TMP10]], [[TMP9]]
; SSE2-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> [[TMP11]], [[TMP8]]
; SSE2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[INDEX]]
-; SSE2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; SSE2-NEXT: store <4 x i32> [[TMP12]], ptr [[TMP14]], align 4
+; SSE2-NEXT: store <4 x i32> [[TMP12]], ptr [[TMP13]], align 4
; SSE2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; SSE2-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SSE2-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -125,9 +124,8 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon
; SSE41-NEXT: [[TMP24:%.*]] = add nsw <4 x i32> [[TMP22]], [[TMP16]]
; SSE41-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[TMP23]], [[TMP17]]
; SSE41-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[INDEX]]
-; SSE41-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 0
; SSE41-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 4
-; SSE41-NEXT: store <4 x i32> [[TMP24]], ptr [[TMP28]], align 4
+; SSE41-NEXT: store <4 x i32> [[TMP24]], ptr [[TMP26]], align 4
; SSE41-NEXT: store <4 x i32> [[TMP25]], ptr [[TMP29]], align 4
; SSE41-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; SSE41-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -215,9 +213,8 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon
; AVX1-NEXT: [[TMP19:%.*]] = add nsw <4 x i32> [[TMP46]], [[TMP44]]
; AVX1-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[TMP47]], [[TMP45]]
; AVX1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[INDEX]]
-; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0
; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 4
-; AVX1-NEXT: store <4 x i32> [[TMP19]], ptr [[TMP25]], align 4
+; AVX1-NEXT: store <4 x i32> [[TMP19]], ptr [[TMP21]], align 4
; AVX1-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP26]], align 4
; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; AVX1-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -288,8 +285,7 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon
; AVX2-NEXT: [[TMP11:%.*]] = mul nsw <8 x i32> [[TMP10]], [[TMP9]]
; AVX2-NEXT: [[TMP12:%.*]] = add nsw <8 x i32> [[TMP11]], [[TMP8]]
; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[INDEX]]
-; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; AVX2-NEXT: store <8 x i32> [[TMP12]], ptr [[TMP14]], align 4
+; AVX2-NEXT: store <8 x i32> [[TMP12]], ptr [[TMP13]], align 4
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; AVX2-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX2-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll
index db9be20..c8cf9fb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll
@@ -32,14 +32,13 @@ define ptr @test(ptr noalias %src, ptr noalias %dst) {
; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF1]] ]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> zeroinitializer, <2 x i32> [[TMP13]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP15]], align 4
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP14]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label %vector.body
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label %exit, label %scalar.ph
+; CHECK-NEXT: br label %exit
; CHECK: exit:
; CHECK-NEXT: [[GEP_LCSSA:%.*]] = phi ptr [ %gep.src, %loop.latch ], [ [[TMP2]], %middle.block ]
; CHECK-NEXT: ret ptr [[GEP_LCSSA]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr56319-vector-exit-cond-optimization-epilogue-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/pr56319-vector-exit-cond-optimization-epilogue-vectorization.ll
index c405e82..00fe141 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr56319-vector-exit-cond-optimization-epilogue-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr56319-vector-exit-cond-optimization-epilogue-vectorization.ll
@@ -21,8 +21,7 @@ define void @pr56319(ptr noalias %src, ptr noalias %dst) {
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <96 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <96 x i8> [[WIDE_VEC]], <96 x i8> poison, <32 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93>
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: store <32 x i8> [[STRIDED_VEC]], ptr [[TMP4]], align 1
+; CHECK-NEXT: store <32 x i8> [[STRIDED_VEC]], ptr [[TMP3]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -39,8 +38,7 @@ define void @pr56319(ptr noalias %src, ptr noalias %dst) {
; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <6 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <6 x i8> [[WIDE_VEC2]], <6 x i8> poison, <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX1]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT: store <2 x i8> [[STRIDED_VEC3]], ptr [[TMP10]], align 1
+; CHECK-NEXT: store <2 x i8> [[STRIDED_VEC3]], ptr [[TMP9]], align 1
; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 2
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 36
; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll
index c2668f0..d695de6 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll
@@ -24,12 +24,11 @@ define void @switch_default_to_latch_common_dest(ptr %start, ptr %end) {
; COST-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; COST-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
; COST-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
-; COST-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0
-; COST-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 1
+; COST-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[NEXT_GEP]], align 1
; COST-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 -12)
; COST-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 13)
; COST-NEXT: [[TMP10:%.*]] = or <4 x i1> [[TMP7]], [[TMP8]]
-; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP6]], i32 1, <4 x i1> [[TMP10]])
+; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP10]])
; COST-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; COST-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; COST-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -77,9 +76,8 @@ define void @switch_default_to_latch_common_dest(ptr %start, ptr %end) {
; FORCED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; FORCED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
; FORCED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
-; FORCED-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0
; FORCED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4
-; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1
+; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[NEXT_GEP]], align 1
; FORCED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1
; FORCED-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 -12)
; FORCED-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], splat (i64 -12)
@@ -87,7 +85,7 @@ define void @switch_default_to_latch_common_dest(ptr %start, ptr %end) {
; FORCED-NEXT: [[TMP12:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], splat (i64 13)
; FORCED-NEXT: [[TMP15:%.*]] = or <4 x i1> [[TMP9]], [[TMP11]]
; FORCED-NEXT: [[TMP16:%.*]] = or <4 x i1> [[TMP10]], [[TMP12]]
-; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP7]], i32 1, <4 x i1> [[TMP15]])
+; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP15]])
; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP8]], i32 1, <4 x i1> [[TMP16]])
; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FORCED-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -161,14 +159,13 @@ define void @switch_default_to_latch_common_dest_using_branches(ptr %start, ptr
; COST-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; COST-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
; COST-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
-; COST-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0
-; COST-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 1
+; COST-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[NEXT_GEP]], align 1
; COST-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 -12)
; COST-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 13)
; COST-NEXT: [[TMP9:%.*]] = xor <4 x i1> [[TMP7]], splat (i1 true)
; COST-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP9]], <4 x i1> [[TMP8]], <4 x i1> zeroinitializer
; COST-NEXT: [[TMP11:%.*]] = or <4 x i1> [[TMP10]], [[TMP7]]
-; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP6]], i32 1, <4 x i1> [[TMP11]])
+; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP11]])
; COST-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; COST-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; COST-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -217,9 +214,8 @@ define void @switch_default_to_latch_common_dest_using_branches(ptr %start, ptr
; FORCED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; FORCED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
; FORCED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
-; FORCED-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0
; FORCED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4
-; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1
+; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[NEXT_GEP]], align 1
; FORCED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1
; FORCED-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 -12)
; FORCED-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], splat (i64 -12)
@@ -231,7 +227,7 @@ define void @switch_default_to_latch_common_dest_using_branches(ptr %start, ptr
; FORCED-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP14]], <4 x i1> [[TMP12]], <4 x i1> zeroinitializer
; FORCED-NEXT: [[TMP17:%.*]] = or <4 x i1> [[TMP15]], [[TMP9]]
; FORCED-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP10]]
-; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP7]], i32 1, <4 x i1> [[TMP17]])
+; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP17]])
; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP8]], i32 1, <4 x i1> [[TMP18]])
; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FORCED-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -341,9 +337,8 @@ define void @switch_all_dests_distinct(ptr %start, ptr %end) {
; FORCED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; FORCED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
; FORCED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
-; FORCED-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0
; FORCED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4
-; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1
+; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[NEXT_GEP]], align 1
; FORCED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1
; FORCED-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 -12)
; FORCED-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], splat (i64 -12)
@@ -357,13 +352,13 @@ define void @switch_all_dests_distinct(ptr %start, ptr %end) {
; FORCED-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP14]]
; FORCED-NEXT: [[TMP19:%.*]] = xor <4 x i1> [[TMP17]], splat (i1 true)
; FORCED-NEXT: [[TMP20:%.*]] = xor <4 x i1> [[TMP18]], splat (i1 true)
-; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr [[TMP7]], i32 1, <4 x i1> [[TMP13]])
+; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP13]])
; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr [[TMP8]], i32 1, <4 x i1> [[TMP14]])
-; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP7]], i32 1, <4 x i1> [[TMP11]])
+; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP11]])
; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP12]])
-; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP7]], i32 1, <4 x i1> [[TMP9]])
+; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP9]])
; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP8]], i32 1, <4 x i1> [[TMP10]])
-; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[TMP7]], i32 1, <4 x i1> [[TMP19]])
+; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP19]])
; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[TMP8]], i32 1, <4 x i1> [[TMP20]])
; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FORCED-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -460,8 +455,7 @@ define void @switch_all_dests_distinct_variant_using_branches(ptr %start, ptr %e
; COST-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; COST-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
; COST-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
-; COST-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0
-; COST-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 1
+; COST-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[NEXT_GEP]], align 1
; COST-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 -12)
; COST-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 13)
; COST-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], zeroinitializer
@@ -469,10 +463,10 @@ define void @switch_all_dests_distinct_variant_using_branches(ptr %start, ptr %e
; COST-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true)
; COST-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP11]], <4 x i1> zeroinitializer
; COST-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP12]], <4 x i1> [[TMP9]], <4 x i1> zeroinitializer
-; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr [[TMP6]], i32 1, <4 x i1> [[TMP13]])
+; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP13]])
; COST-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP8]], <4 x i1> zeroinitializer
-; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP6]], i32 1, <4 x i1> [[TMP14]])
-; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP6]], i32 1, <4 x i1> [[TMP7]])
+; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP14]])
+; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP7]])
; COST-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; COST-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; COST-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -533,9 +527,8 @@ define void @switch_all_dests_distinct_variant_using_branches(ptr %start, ptr %e
; FORCED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; FORCED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
; FORCED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
-; FORCED-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0
; FORCED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4
-; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1
+; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[NEXT_GEP]], align 1
; FORCED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1
; FORCED-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 -12)
; FORCED-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], splat (i64 -12)
@@ -551,13 +544,13 @@ define void @switch_all_dests_distinct_variant_using_branches(ptr %start, ptr %e
; FORCED-NEXT: [[TMP20:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> [[TMP18]], <4 x i1> zeroinitializer
; FORCED-NEXT: [[TMP21:%.*]] = select <4 x i1> [[TMP19]], <4 x i1> [[TMP13]], <4 x i1> zeroinitializer
; FORCED-NEXT: [[TMP22:%.*]] = select <4 x i1> [[TMP20]], <4 x i1> [[TMP14]], <4 x i1> zeroinitializer
-; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr [[TMP7]], i32 1, <4 x i1> [[TMP21]])
+; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP21]])
; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr [[TMP8]], i32 1, <4 x i1> [[TMP22]])
; FORCED-NEXT: [[TMP23:%.*]] = select <4 x i1> [[TMP15]], <4 x i1> [[TMP11]], <4 x i1> zeroinitializer
; FORCED-NEXT: [[TMP24:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> [[TMP12]], <4 x i1> zeroinitializer
-; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP7]], i32 1, <4 x i1> [[TMP23]])
+; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP23]])
; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP24]])
-; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP7]], i32 1, <4 x i1> [[TMP9]])
+; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP9]])
; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP8]], i32 1, <4 x i1> [[TMP10]])
; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FORCED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -694,9 +687,8 @@ define void @switch_multiple_common_dests(ptr %start, ptr %end) {
; FORCED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; FORCED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
; FORCED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
-; FORCED-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0
; FORCED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4
-; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1
+; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[NEXT_GEP]], align 1
; FORCED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1
; FORCED-NEXT: [[TMP23:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 -12)
; FORCED-NEXT: [[TMP24:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], splat (i64 -12)
@@ -718,11 +710,11 @@ define void @switch_multiple_common_dests(ptr %start, ptr %end) {
; FORCED-NEXT: [[TMP38:%.*]] = or <4 x i1> [[TMP28]], [[TMP36]]
; FORCED-NEXT: [[TMP39:%.*]] = xor <4 x i1> [[TMP37]], splat (i1 true)
; FORCED-NEXT: [[TMP40:%.*]] = xor <4 x i1> [[TMP38]], splat (i1 true)
-; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP7]], i32 1, <4 x i1> [[TMP35]])
+; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP35]])
; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP36]])
-; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP7]], i32 1, <4 x i1> [[TMP27]])
+; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP27]])
; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP8]], i32 1, <4 x i1> [[TMP28]])
-; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[TMP7]], i32 1, <4 x i1> [[TMP39]])
+; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP39]])
; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[TMP8]], i32 1, <4 x i1> [[TMP40]])
; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FORCED-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -844,9 +836,8 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
; FORCED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; FORCED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
; FORCED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
-; FORCED-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0
; FORCED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4
-; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1
+; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[NEXT_GEP]], align 1
; FORCED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1
; FORCED-NEXT: [[TMP15:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 -12)
; FORCED-NEXT: [[TMP16:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], splat (i64 -12)
@@ -856,11 +847,11 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
; FORCED-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP12]]
; FORCED-NEXT: [[TMP20:%.*]] = xor <4 x i1> [[TMP17]], splat (i1 true)
; FORCED-NEXT: [[TMP21:%.*]] = xor <4 x i1> [[TMP18]], splat (i1 true)
-; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP7]], i32 1, <4 x i1> [[TMP11]])
+; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP11]])
; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP12]])
-; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP7]], i32 1, <4 x i1> [[TMP15]])
+; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP15]])
; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP8]], i32 1, <4 x i1> [[TMP16]])
-; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[TMP7]], i32 1, <4 x i1> [[TMP20]])
+; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP20]])
; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[TMP8]], i32 1, <4 x i1> [[TMP21]])
; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FORCED-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -952,8 +943,7 @@ define void @switch_under_br_default_common_dest_with_case(ptr %start, ptr %end,
; COST-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; COST-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
; COST-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
-; COST-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0
-; COST-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 1
+; COST-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[NEXT_GEP]], align 1
; COST-NEXT: [[TMP7:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; COST-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 -12)
; COST-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 13)
@@ -962,9 +952,9 @@ define void @switch_under_br_default_common_dest_with_case(ptr %start, ptr %end,
; COST-NEXT: [[TMP12:%.*]] = or <4 x i1> [[TMP10]], [[TMP11]]
; COST-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP12]], splat (i1 true)
; COST-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP7]], <4 x i1> [[TMP13]], <4 x i1> zeroinitializer
-; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP6]], i32 1, <4 x i1> [[TMP11]])
-; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP6]], i32 1, <4 x i1> [[TMP10]])
-; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[TMP6]], i32 1, <4 x i1> [[TMP14]])
+; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP11]])
+; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP10]])
+; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP14]])
; COST-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; COST-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; COST-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -1024,9 +1014,8 @@ define void @switch_under_br_default_common_dest_with_case(ptr %start, ptr %end,
; FORCED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; FORCED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
; FORCED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
-; FORCED-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0
; FORCED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4
-; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1
+; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[NEXT_GEP]], align 1
; FORCED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1
; FORCED-NEXT: [[TMP9:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; FORCED-NEXT: [[TMP10:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD4]], [[BROADCAST_SPLAT]]
@@ -1044,11 +1033,11 @@ define void @switch_under_br_default_common_dest_with_case(ptr %start, ptr %end,
; FORCED-NEXT: [[TMP22:%.*]] = xor <4 x i1> [[TMP20]], splat (i1 true)
; FORCED-NEXT: [[TMP23:%.*]] = select <4 x i1> [[TMP9]], <4 x i1> [[TMP21]], <4 x i1> zeroinitializer
; FORCED-NEXT: [[TMP24:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP22]], <4 x i1> zeroinitializer
-; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP7]], i32 1, <4 x i1> [[TMP25]])
+; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP25]])
; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP26]])
-; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP7]], i32 1, <4 x i1> [[TMP15]])
+; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP15]])
; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP8]], i32 1, <4 x i1> [[TMP16]])
-; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[TMP7]], i32 1, <4 x i1> [[TMP23]])
+; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP23]])
; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[TMP8]], i32 1, <4 x i1> [[TMP24]])
; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FORCED-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1178,9 +1167,8 @@ define void @br_under_switch_default_common_dest_with_case(ptr %start, ptr %end,
; FORCED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; FORCED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
; FORCED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
-; FORCED-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0
; FORCED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4
-; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1
+; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[NEXT_GEP]], align 1
; FORCED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1
; FORCED-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 -12)
; FORCED-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], splat (i64 -12)
@@ -1198,15 +1186,15 @@ define void @br_under_switch_default_common_dest_with_case(ptr %start, ptr %end,
; FORCED-NEXT: [[TMP28:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP20]], <4 x i1> zeroinitializer
; FORCED-NEXT: [[TMP29:%.*]] = or <4 x i1> [[TMP27]], [[TMP25]]
; FORCED-NEXT: [[TMP30:%.*]] = or <4 x i1> [[TMP28]], [[TMP26]]
-; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP7]], i32 1, <4 x i1> [[TMP29]])
+; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP29]])
; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP30]])
; FORCED-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP9]], <4 x i1> [[TMP17]], <4 x i1> zeroinitializer
; FORCED-NEXT: [[TMP33:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP18]], <4 x i1> zeroinitializer
-; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP7]], i32 1, <4 x i1> [[TMP32]])
+; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP32]])
; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP8]], i32 1, <4 x i1> [[TMP33]])
; FORCED-NEXT: [[TMP36:%.*]] = or <4 x i1> [[TMP32]], [[TMP15]]
; FORCED-NEXT: [[TMP37:%.*]] = or <4 x i1> [[TMP33]], [[TMP16]]
-; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[TMP7]], i32 1, <4 x i1> [[TMP36]])
+; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP36]])
; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[TMP8]], i32 1, <4 x i1> [[TMP37]])
; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FORCED-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1331,9 +1319,8 @@ define void @large_number_of_cases(ptr %start, ptr %end) {
; FORCED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; FORCED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
; FORCED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
-; FORCED-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0
; FORCED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4
-; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1
+; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[NEXT_GEP]], align 1
; FORCED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1
; FORCED-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 1)
; FORCED-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], splat (i64 1)
@@ -1369,7 +1356,7 @@ define void @large_number_of_cases(ptr %start, ptr %end) {
; FORCED-NEXT: [[TMP40:%.*]] = or <4 x i1> [[TMP38]], [[TMP24]]
; FORCED-NEXT: [[TMP57:%.*]] = or <4 x i1> [[TMP39]], [[TMP25]]
; FORCED-NEXT: [[TMP58:%.*]] = or <4 x i1> [[TMP40]], [[TMP26]]
-; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP7]], i32 1, <4 x i1> [[TMP57]])
+; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP57]])
; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP8]], i32 1, <4 x i1> [[TMP58]])
; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; FORCED-NEXT: [[TMP59:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll
index 945d500..3a2d7e7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll
@@ -34,9 +34,8 @@ define void @pr15344(ptr noalias %ar, ptr noalias %ar2, i32 %exit.limit, i1 %con
; CHECK-NEXT: [[TMP2]] = fadd fast <2 x double> [[VEC_PHI]], splat (double 1.000000e+00)
; CHECK-NEXT: [[TMP3]] = fadd fast <2 x double> [[VEC_PHI2]], splat (double 1.000000e+00)
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[AR2]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 2
-; CHECK-NEXT: store <2 x float> splat (float 2.000000e+00), ptr [[TMP5]], align 4, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
+; CHECK-NEXT: store <2 x float> splat (float 2.000000e+00), ptr [[TMP4]], align 4, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
; CHECK-NEXT: store <2 x float> splat (float 2.000000e+00), ptr [[TMP6]], align 4, !alias.scope [[META0]], !noalias [[META3]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
index 9388aac..0e83cf3 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
@@ -60,9 +60,8 @@ define float @reduction_sum_float_fastmath(i32 %n, ptr %array) {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[ARRAY:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[TMP2]], i32 4
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP6]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP7]] = fadd fast <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
@@ -72,10 +71,10 @@ define float @reduction_sum_float_fastmath(i32 %n, ptr %array) {
; CHECK: middle.block:
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP7]], [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX]])
-; CHECK-NEXT: br i1 true, label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_EXIT_LOOPEXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[LOOP_PREHEADER]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -126,9 +125,8 @@ define float @reduction_sum_float_only_reassoc(i32 %n, ptr %array) {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[ARRAY:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[TMP2]], i32 4
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP6]] = fadd reassoc <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP7]] = fadd reassoc <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
@@ -138,10 +136,10 @@ define float @reduction_sum_float_only_reassoc(i32 %n, ptr %array) {
; CHECK: middle.block:
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <4 x float> [[TMP7]], [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
-; CHECK-NEXT: br i1 true, label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_EXIT_LOOPEXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ -0.000000e+00, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[LOOP_PREHEADER]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -192,9 +190,8 @@ define float @reduction_sum_float_only_reassoc_and_contract(i32 %n, ptr %array)
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -0.000000e+00), [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[ARRAY:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[TMP2]], i32 4
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP6]] = fadd reassoc contract <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP7]] = fadd reassoc contract <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
@@ -204,10 +201,10 @@ define float @reduction_sum_float_only_reassoc_and_contract(i32 %n, ptr %array)
; CHECK: middle.block:
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc contract <4 x float> [[TMP7]], [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
-; CHECK-NEXT: br i1 true, label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_EXIT_LOOPEXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ -0.000000e+00, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[LOOP_PREHEADER]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -265,9 +262,8 @@ define float @PR35538(ptr nocapture readonly %a, i32 %N) #0 {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+00), [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -1.000000e+00), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 4
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = fcmp nnan ninf nsz oge <4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP7:%.*]] = fcmp nnan ninf nsz oge <4 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
@@ -347,9 +343,8 @@ define float @PR35538_more_FMF(ptr nocapture readonly %a, i32 %N) #0 {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+00), [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -1.000000e+00), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 4
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = fcmp nnan ninf oge <4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP7:%.*]] = fcmp nnan ninf oge <4 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll
index e588335..1c6a225 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll
@@ -20,7 +20,7 @@ define void @replicate_udiv_with_only_first_lane_used(i32 %x, ptr %dst, i64 %d)
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
@@ -36,7 +36,7 @@ define void @replicate_udiv_with_only_first_lane_used(i32 %x, ptr %dst, i64 %d)
; CHECK-NEXT: store i16 0, ptr [[GEP]], align 2
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 101
-; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT:.*]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -172,7 +172,7 @@ define void @replicate_udiv_with_only_first_lane_used2(i32 %x, ptr %dst, i64 %d)
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
@@ -188,7 +188,7 @@ define void @replicate_udiv_with_only_first_lane_used2(i32 %x, ptr %dst, i64 %d)
; CHECK-NEXT: store i16 0, ptr [[GEP]], align 2
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 101
-; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT:.*]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll
index ed1c67c..0b61f20 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll
@@ -53,9 +53,9 @@ define void @smax_call_uniform(ptr %dst, i64 %x) {
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], %[[LOOP_LATCH:.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll b/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll
index 1bcaa21..440f6e1 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll
@@ -32,8 +32,7 @@ define void @value_defined_in_loop1_used_for_trip_counts(i32 %start, i1 %c, ptr
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <16 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> zeroinitializer, ptr [[TMP2]], i32 1, <16 x i1> [[TMP0]])
+; CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> zeroinitializer, ptr [[DST]], i32 1, <16 x i1> [[TMP0]])
; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT_1_LOOPEXIT1:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
index 5917b30..c67b02b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -39,7 +39,7 @@ define void @example1() optsize {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[TMP7:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[TMP7:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[TMP6:%.*]]
; CHECK: 6:
@@ -418,7 +418,7 @@ define void @example23b(ptr noalias nocapture %src, ptr noalias nocapture %dst)
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[TMP5:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[TMP5:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[TMP4:%.*]]
; CHECK: 4:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
index ad8f1fb..33b173d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
@@ -58,11 +58,10 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u
; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], 30
; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 0
; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 8
; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 16
; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 24
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP36]], align 4, !tbaa [[TBAA1:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP32]], align 4, !tbaa [[TBAA1:![0-9]+]]
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i32>, ptr [[TMP37]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP38]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP39]], align 4, !tbaa [[TBAA1]]
@@ -198,8 +197,7 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u
; CHECK-NEXT: [[TMP174:%.*]] = add i64 [[INDEX9]], 2
; CHECK-NEXT: [[TMP175:%.*]] = add i64 [[INDEX9]], 3
; CHECK-NEXT: [[TMP152:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[TMP172]]
-; CHECK-NEXT: [[TMP153:%.*]] = getelementptr inbounds i32, ptr [[TMP152]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, ptr [[TMP153]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, ptr [[TMP152]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT: [[TMP154:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP172]], i64 [[IDXPROM5]]
; CHECK-NEXT: [[TMP155:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP173]], i64 [[IDXPROM5]]
; CHECK-NEXT: [[TMP156:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP174]], i64 [[IDXPROM5]]
@@ -290,11 +288,10 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u
; MAX-BW-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], 30
; MAX-BW-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31
; MAX-BW-NEXT: [[TMP32:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]]
-; MAX-BW-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 0
; MAX-BW-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 8
; MAX-BW-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 16
; MAX-BW-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 24
-; MAX-BW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP36]], align 4, !tbaa [[TBAA1:![0-9]+]]
+; MAX-BW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP32]], align 4, !tbaa [[TBAA1:![0-9]+]]
; MAX-BW-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i32>, ptr [[TMP37]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP38]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP39]], align 4, !tbaa [[TBAA1]]
@@ -430,8 +427,7 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u
; MAX-BW-NEXT: [[TMP174:%.*]] = add i64 [[INDEX9]], 2
; MAX-BW-NEXT: [[TMP175:%.*]] = add i64 [[INDEX9]], 3
; MAX-BW-NEXT: [[TMP152:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[TMP172]]
-; MAX-BW-NEXT: [[TMP153:%.*]] = getelementptr inbounds i32, ptr [[TMP152]], i32 0
-; MAX-BW-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, ptr [[TMP153]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, ptr [[TMP152]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT: [[TMP154:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP172]], i64 [[IDXPROM5]]
; MAX-BW-NEXT: [[TMP155:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP173]], i64 [[IDXPROM5]]
; MAX-BW-NEXT: [[TMP156:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP174]], i64 [[IDXPROM5]]
@@ -558,9 +554,9 @@ define void @test(ptr %A, ptr noalias %B) #0 {
; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -678,9 +674,9 @@ define void @test(ptr %A, ptr noalias %B) #0 {
; MAX-BW-NEXT: [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; MAX-BW-NEXT: br i1 [[TMP68]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; MAX-BW: middle.block:
-; MAX-BW-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; MAX-BW-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; MAX-BW: scalar.ph:
-; MAX-BW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; MAX-BW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; MAX-BW-NEXT: br label [[FOR_BODY:%.*]]
; MAX-BW: for.body:
; MAX-BW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
index 5fc9e64..5e35c4a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
@@ -18,15 +18,12 @@ define dso_local void @tail_folding_enabled(ptr noalias nocapture %A, ptr noalia
; CHECK-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], splat (i64 429)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison)
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP5]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison)
; CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <8 x i1> [[TMP1]])
+; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP6]], ptr [[TMP7]], i32 4, <8 x i1> [[TMP1]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -85,15 +82,12 @@ define dso_local void @tail_folding_disabled(ptr noalias nocapture %A, ptr noali
; CHECK-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], splat (i64 429)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison)
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP5]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison)
; CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <8 x i1> [[TMP1]])
+; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP6]], ptr [[TMP7]], i32 4, <8 x i1> [[TMP1]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -170,11 +164,9 @@ define i32 @reduction_i32(ptr nocapture readonly %A, ptr nocapture readonly %B,
; CHECK-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP6]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP5]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison)
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP8]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison)
; CHECK-NEXT: [[TMP9:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP10]] = add <8 x i32> [[TMP9]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP10]], <8 x i32> [[VEC_PHI]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
index cb7f0bf..2a3ce03 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
@@ -104,8 +104,7 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = shl nsw i64 [[IV]], 1
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
@@ -183,8 +182,7 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = shl nsw i64 [[IV]], 1
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
@@ -262,8 +260,7 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = shl nsw i64 [[IV]], 1
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
@@ -341,8 +338,7 @@ define void @test_2xi64_store_order_flipped_2(ptr noalias %data, ptr noalias %fa
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = shl nsw i64 [[IV]], 1
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
@@ -422,19 +418,17 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i64, ptr [[SRC_0]], i64 [[IV]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[GEP_SRC_0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[GEP_SRC_0]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = shl nsw i64 [[IV]], 1
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP5]]
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[GEP_SRC_0]], align 8
; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP7:%.*]] = or disjoint i64 [[TMP5]], 1
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP8]], align 8
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[SRC_1]], i64 [[IV]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
; CHECK-NEXT: [[TMP12:%.*]] = mul <4 x i64> [[WIDE_LOAD2]], [[STRIDED_VEC]]
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP14]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
@@ -509,8 +503,7 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP3]], align 8
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll
index 074ea6b..62d08c8 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll
@@ -25,11 +25,10 @@ define void @foo(ptr nocapture noalias %A, i64 %N) #0 {
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @inc, align 4
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[TMP1]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A]], i32 8
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i32 16
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i32 24
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[A]], align 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x float>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x float>, ptr [[TMP6]], align 4
@@ -37,11 +36,10 @@ define void @foo(ptr nocapture noalias %A, i64 %N) #0 {
; CHECK-NEXT: [[TMP8:%.*]] = fadd <8 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[TMP9:%.*]] = fadd <8 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD3]]
; CHECK-NEXT: [[TMP10:%.*]] = fadd <8 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD4]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i32 0
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i32 8
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A]], i32 16
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[A]], i32 24
-; CHECK-NEXT: store <8 x float> [[TMP7]], ptr [[TMP14]], align 4
+; CHECK-NEXT: store <8 x float> [[TMP7]], ptr [[A]], align 4
; CHECK-NEXT: store <8 x float> [[TMP8]], ptr [[TMP11]], align 4
; CHECK-NEXT: store <8 x float> [[TMP9]], ptr [[TMP12]], align 4
; CHECK-NEXT: store <8 x float> [[TMP10]], ptr [[TMP13]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
index 8c91c31..68b4f20 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
@@ -16,23 +16,22 @@ define i32 @uniform_load(ptr align(4) %addr) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ADDR:%.*]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[ADDR]], align 4
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[ADDR:%.*]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: loopexit:
-; CHECK-NEXT: [[LOAD_LCSSA:%.*]] = phi i32 [ [[LOAD]], [[FOR_BODY]] ], [ [[TMP0]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[LOAD_LCSSA:%.*]] = phi i32 [ [[LOAD]], [[FOR_BODY]] ]
; CHECK-NEXT: ret i32 [[LOAD_LCSSA]]
;
entry:
@@ -76,7 +75,7 @@ define i32 @uniform_load2(ptr align(4) %addr) {
; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP3]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP4]], [[BIN_RDX10]]
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
-; CHECK-NEXT: br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
@@ -88,9 +87,9 @@ define i32 @uniform_load2(ptr align(4) %addr) {
; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[LOAD]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: loopexit:
-; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ]
; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]]
;
entry:
@@ -114,30 +113,27 @@ define i32 @uniform_address(ptr align(4) %addr, i32 %byte_offset) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = udiv i32 [[BYTE_OFFSET:%.*]], 4
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[ADDR:%.*]], i32 [[TMP0]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[OFFSET:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[ADDR]], i32 [[OFFSET]]
+; CHECK-NEXT: [[OFFSET:%.*]] = udiv i32 [[BYTE_OFFSET:%.*]], 4
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[ADDR:%.*]], i32 [[OFFSET]]
; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: loopexit:
-; CHECK-NEXT: [[LOAD_LCSSA:%.*]] = phi i32 [ [[LOAD]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[LOAD_LCSSA:%.*]] = phi i32 [ [[LOAD]], [[FOR_BODY]] ]
; CHECK-NEXT: ret i32 [[LOAD_LCSSA]]
;
entry:
@@ -171,7 +167,7 @@ define void @uniform_store_uniform_value(ptr align(4) %addr) {
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
@@ -180,7 +176,7 @@ define void @uniform_store_uniform_value(ptr align(4) %addr) {
; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: loopexit:
; CHECK-NEXT: ret void
;
@@ -216,7 +212,7 @@ define void @uniform_store_varying_value(ptr align(4) %addr) {
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
@@ -226,7 +222,7 @@ define void @uniform_store_varying_value(ptr align(4) %addr) {
; CHECK-NEXT: store i32 [[IV_I32]], ptr [[ADDR]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: loopexit:
; CHECK-NEXT: ret void
;
@@ -297,7 +293,7 @@ define void @uniform_copy(ptr %A, ptr %B) {
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
@@ -307,7 +303,7 @@ define void @uniform_copy(ptr %A, ptr %B) {
; CHECK-NEXT: store i32 [[LOAD]], ptr [[B]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: loopexit:
; CHECK-NEXT: ret void
;
@@ -408,10 +404,10 @@ define i32 @test_count_bits(ptr %test_base) {
; CHECK-NEXT: [[BIN_RDX13:%.*]] = add <4 x i32> [[TMP38]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX14:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX13]]
; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX14]])
-; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP41]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -486,7 +482,7 @@ define i32 @uniform_load_global() {
; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP3]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP4]], [[BIN_RDX10]]
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
-; CHECK-NEXT: br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
@@ -498,9 +494,9 @@ define i32 @uniform_load_global() {
; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[LOAD]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: loopexit:
-; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ]
; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]]
;
entry:
@@ -547,7 +543,7 @@ define i32 @uniform_load_constexpr() {
; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP3]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP4]], [[BIN_RDX10]]
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
-; CHECK-NEXT: br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
@@ -559,9 +555,9 @@ define i32 @uniform_load_constexpr() {
; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[LOAD]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK: loopexit:
-; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ]
; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
index d7a3b32..f7eba42 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
@@ -29,20 +29,18 @@ define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 4
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 8
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 12
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]]
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP5]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 4
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 8
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 12
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP7]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP8]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP9]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP10]], align 4, !llvm.access.group [[ACC_GRP0]]
@@ -50,7 +48,7 @@ define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly
; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
; CHECK-NEXT: [[TMP13:%.*]] = fadd fast <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
-; CHECK-NEXT: store <4 x float> [[TMP11]], ptr [[TMP7]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: store <4 x float> [[TMP11]], ptr [[TMP6]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: store <4 x float> [[TMP12]], ptr [[TMP8]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: store <4 x float> [[TMP13]], ptr [[TMP9]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: store <4 x float> [[TMP14]], ptr [[TMP10]], align 4, !llvm.access.group [[ACC_GRP0]]
@@ -67,13 +65,11 @@ define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX8]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP18]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP17]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX8]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP20]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP19]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: [[TMP21:%.*]] = fadd fast <4 x float> [[WIDE_LOAD9]], [[WIDE_LOAD10]]
-; CHECK-NEXT: store <4 x float> [[TMP21]], ptr [[TMP20]], align 4, !llvm.access.group [[ACC_GRP0]]
+; CHECK-NEXT: store <4 x float> [[TMP21]], ptr [[TMP19]], align 4, !llvm.access.group [[ACC_GRP0]]
; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX8]], 4
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 20
; CHECK-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -136,13 +132,11 @@ define void @vectorized1(ptr noalias nocapture %A, ptr noalias nocapture readonl
; CHECK-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], splat (i64 19)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP7:![0-9]+]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP7:![0-9]+]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP5]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP7]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP7]]
; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP6]], ptr [[TMP5]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group [[ACC_GRP7]]
+; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP6]], ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group [[ACC_GRP7]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -200,20 +194,18 @@ define void @vectorized2(ptr noalias nocapture %A, ptr noalias nocapture readonl
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP7]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP1]], align 4, !llvm.access.group [[ACC_GRP7]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP7]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP7]]
; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP7]]
+; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP7]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll
index 27321e7..59f2925 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll
@@ -28,15 +28,12 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
-; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP3]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison)
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison)
; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[INDEX]]
-; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; IF-EVL-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP5]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison)
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP4]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison)
; IF-EVL-NEXT: [[TMP6:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]]
; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; IF-EVL-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <16 x i1> [[TMP1]])
+; IF-EVL-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP6]], ptr [[TMP7]], i32 4, <16 x i1> [[TMP1]])
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; IF-EVL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; IF-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -74,20 +71,18 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; NO-VP: vector.body:
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 16
; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 32
; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 48
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4
; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i32>, ptr [[TMP9]], align 4
; NO-VP-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i32>, ptr [[TMP10]], align 4
; NO-VP-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i32>, ptr [[TMP11]], align 4
; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
; NO-VP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 16
; NO-VP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 32
; NO-VP-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 48
-; NO-VP-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr [[TMP16]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr [[TMP12]], align 4
; NO-VP-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr [[TMP17]], align 4
; NO-VP-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP18]], align 4
; NO-VP-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i32>, ptr [[TMP19]], align 4
@@ -96,11 +91,10 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; NO-VP-NEXT: [[TMP22:%.*]] = add nsw <16 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD3]]
; NO-VP-NEXT: [[TMP23:%.*]] = add nsw <16 x i32> [[WIDE_LOAD8]], [[WIDE_LOAD4]]
; NO-VP-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 0
; NO-VP-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 16
; NO-VP-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 32
; NO-VP-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 48
-; NO-VP-NEXT: store <16 x i32> [[TMP20]], ptr [[TMP28]], align 4
+; NO-VP-NEXT: store <16 x i32> [[TMP20]], ptr [[TMP24]], align 4
; NO-VP-NEXT: store <16 x i32> [[TMP21]], ptr [[TMP29]], align 4
; NO-VP-NEXT: store <16 x i32> [[TMP22]], ptr [[TMP30]], align 4
; NO-VP-NEXT: store <16 x i32> [[TMP23]], ptr [[TMP31]], align 4
@@ -122,15 +116,12 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; NO-VP: vec.epilog.vector.body:
; NO-VP-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX12]]
-; NO-VP-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP34]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, ptr [[TMP35]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, ptr [[TMP34]], align 4
; NO-VP-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX12]]
-; NO-VP-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, ptr [[TMP37]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, ptr [[TMP36]], align 4
; NO-VP-NEXT: [[TMP38:%.*]] = add nsw <8 x i32> [[WIDE_LOAD14]], [[WIDE_LOAD13]]
; NO-VP-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX12]]
-; NO-VP-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 0
-; NO-VP-NEXT: store <8 x i32> [[TMP38]], ptr [[TMP40]], align 4
+; NO-VP-NEXT: store <8 x i32> [[TMP38]], ptr [[TMP39]], align 4
; NO-VP-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 8
; NO-VP-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC10]]
; NO-VP-NEXT: br i1 [[TMP41]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll
index 7b1c60b..e9d85c2 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll
@@ -71,8 +71,8 @@ define void @test_pr59090(ptr %l_out, ptr noalias %b) #0 {
; CHECK-NEXT: [[TMP15:%.*]] = and <48 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false>
; CHECK-NEXT: call void @llvm.masked.store.v48i8.p0(<48 x i8> <i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison>, ptr [[TMP13]], i32 1, <48 x i1> [[TMP15]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10008
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10008
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll b/llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll
index 5c34971..05d08a4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll
@@ -18,11 +18,10 @@ define void @iv.4_used_as_vector_and_first_lane(ptr %src, ptr noalias %dst) {
; CHECK-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
; CHECK-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4)
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 4
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 8
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 12
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8
@@ -37,11 +36,10 @@ define void @iv.4_used_as_vector_and_first_lane(ptr %src, ptr noalias %dst) {
; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP12]], i32 0
; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP26]], 1
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP27]]
-; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i64, ptr [[TMP28]], i32 0
; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i64, ptr [[TMP28]], i32 4
; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i64, ptr [[TMP28]], i32 8
; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i64, ptr [[TMP28]], i32 12
-; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP12]], ptr [[TMP32]], i32 4, <4 x i1> [[TMP16]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP12]], ptr [[TMP28]], i32 4, <4 x i1> [[TMP16]])
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP13]], ptr [[TMP33]], i32 4, <4 x i1> [[TMP17]])
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP14]], ptr [[TMP34]], i32 4, <4 x i1> [[TMP18]])
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP15]], ptr [[TMP35]], i32 4, <4 x i1> [[TMP19]])
@@ -50,9 +48,9 @@ define void @iv.4_used_as_vector_and_first_lane(ptr %src, ptr noalias %dst) {
; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
; CHECK: loop.header:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
@@ -109,11 +107,10 @@ define void @iv.4_used_as_first_lane(ptr %src, ptr noalias %dst) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 4
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 8
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 12
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8
@@ -124,11 +121,10 @@ define void @iv.4_used_as_first_lane(ptr %src, ptr noalias %dst) {
; CHECK-NEXT: [[TMP19:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD3]], splat (i64 128)
; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[TMP15]], 1
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP23]]
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i64, ptr [[TMP24]], i32 0
; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i64, ptr [[TMP24]], i32 4
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i64, ptr [[TMP24]], i32 8
; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i64, ptr [[TMP24]], i32 12
-; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_LOAD]], ptr [[TMP28]], i32 4, <4 x i1> [[TMP16]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_LOAD]], ptr [[TMP24]], i32 4, <4 x i1> [[TMP16]])
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_LOAD1]], ptr [[TMP29]], i32 4, <4 x i1> [[TMP17]])
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_LOAD2]], ptr [[TMP30]], i32 4, <4 x i1> [[TMP18]])
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_LOAD3]], ptr [[TMP31]], i32 4, <4 x i1> [[TMP19]])
@@ -136,9 +132,9 @@ define void @iv.4_used_as_first_lane(ptr %src, ptr noalias %dst) {
; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
; CHECK: loop.header:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll
index 93880f5..b0ae40c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll
@@ -19,11 +19,9 @@ define i32 @predicated_sdiv_masked_load(ptr %a, ptr %b, i32 %x, i1 %c) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE2:%.*]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP17:%.*]], [[PRED_SDIV_CONTINUE2]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr [[TMP4]], i32 4, <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr [[TMP3]], i32 4, <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> poison)
; CHECK-NEXT: br i1 [[C]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]]
; CHECK: pred.sdiv.if:
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[WIDE_MASKED_LOAD]], i32 0
@@ -61,11 +59,9 @@ define i32 @predicated_sdiv_masked_load(ptr %a, ptr %b, i32 %x, i1 %c) {
; SINK-GATHER-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE14:%.*]] ]
; SINK-GATHER-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_SDIV_CONTINUE14]] ]
; SINK-GATHER-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; SINK-GATHER-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; SINK-GATHER-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4
+; SINK-GATHER-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP1]], align 4
; SINK-GATHER-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[INDEX]]
-; SINK-GATHER-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0
-; SINK-GATHER-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP4]], i32 4, <8 x i1> [[BROADCAST_SPLAT]], <8 x i32> poison)
+; SINK-GATHER-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[BROADCAST_SPLAT]], <8 x i32> poison)
; SINK-GATHER-NEXT: br i1 [[C]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]]
; SINK-GATHER: pred.sdiv.if:
; SINK-GATHER-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 0
@@ -138,10 +134,10 @@ define i32 @predicated_sdiv_masked_load(ptr %a, ptr %b, i32 %x, i1 %c) {
; SINK-GATHER-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; SINK-GATHER: middle.block:
; SINK-GATHER-NEXT: [[TMP49:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP47]])
-; SINK-GATHER-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; SINK-GATHER-NEXT: br label [[FOR_END:%.*]]
; SINK-GATHER: scalar.ph:
-; SINK-GATHER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SINK-GATHER-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP49]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; SINK-GATHER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; SINK-GATHER-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ]
; SINK-GATHER-NEXT: br label [[FOR_BODY:%.*]]
; SINK-GATHER: for.body:
; SINK-GATHER-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/blend-in-header.ll b/llvm/test/Transforms/LoopVectorize/blend-in-header.ll
index 33edaf2..4f33b5b 100644
--- a/llvm/test/Transforms/LoopVectorize/blend-in-header.ll
+++ b/llvm/test/Transforms/LoopVectorize/blend-in-header.ll
@@ -17,7 +17,7 @@ define i64 @pr88297() {
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
@@ -30,9 +30,9 @@ define i64 @pr88297() {
; CHECK-NEXT: [[R:%.*]] = phi i64 [ 1, [[THEN]] ], [ 0, [[LOOP_HEADER]] ]
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: [[ICMP:%.*]] = icmp sgt i32 [[IV]], 1000
-; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: exit:
-; CHECK-NEXT: [[R_LCSSA:%.*]] = phi i64 [ [[R]], [[LOOP_LATCH]] ], [ 1, [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[R_LCSSA:%.*]] = phi i64 [ [[R]], [[LOOP_LATCH]] ]
; CHECK-NEXT: ret i64 [[R_LCSSA]]
;
entry:
@@ -68,7 +68,7 @@ define i64 @pr88297_incoming_ops_reordered() {
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
@@ -81,9 +81,9 @@ define i64 @pr88297_incoming_ops_reordered() {
; CHECK-NEXT: [[R:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ 1, [[THEN]] ]
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: [[ICMP:%.*]] = icmp sgt i32 [[IV]], 1000
-; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: exit:
-; CHECK-NEXT: [[R_LCSSA:%.*]] = phi i64 [ [[R]], [[LOOP_LATCH]] ], [ 1, [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[R_LCSSA:%.*]] = phi i64 [ [[R]], [[LOOP_LATCH]] ]
; CHECK-NEXT: ret i64 [[R_LCSSA]]
;
entry:
@@ -115,7 +115,6 @@ define i64 @invar_cond(i1 %c) {
; CHECK: vector.ph:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> zeroinitializer, <4 x i64> splat (i64 1)
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -123,8 +122,7 @@ define i64 @invar_cond(i1 %c) {
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 3
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
@@ -137,9 +135,9 @@ define i64 @invar_cond(i1 %c) {
; CHECK-NEXT: [[R:%.*]] = phi i64 [ 1, [[THEN]] ], [ 0, [[LOOP_HEADER]] ]
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: [[ICMP:%.*]] = icmp sgt i32 [[IV]], 1000
-; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: exit:
-; CHECK-NEXT: [[R_LCSSA:%.*]] = phi i64 [ [[R]], [[LOOP_LATCH]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[R_LCSSA:%.*]] = phi i64 [ [[R]], [[LOOP_LATCH]] ]
; CHECK-NEXT: ret i64 [[R_LCSSA]]
;
entry:
@@ -171,7 +169,6 @@ define i64 @invar_cond_incoming_ops_reordered(i1 %c) {
; CHECK: vector.ph:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> zeroinitializer, <4 x i64> splat (i64 1)
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -179,8 +176,7 @@ define i64 @invar_cond_incoming_ops_reordered(i1 %c) {
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 3
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
@@ -193,9 +189,9 @@ define i64 @invar_cond_incoming_ops_reordered(i1 %c) {
; CHECK-NEXT: [[R:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ 1, [[THEN]] ]
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: [[ICMP:%.*]] = icmp sgt i32 [[IV]], 1000
-; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: exit:
-; CHECK-NEXT: [[R_LCSSA:%.*]] = phi i64 [ [[R]], [[LOOP_LATCH]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[R_LCSSA:%.*]] = phi i64 [ [[R]], [[LOOP_LATCH]] ]
; CHECK-NEXT: ret i64 [[R_LCSSA]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll
index 8397b1e..c95ec0d 100644
--- a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll
+++ b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll
@@ -37,7 +37,7 @@ define i32 @foo(ptr nocapture %A) {
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
diff --git a/llvm/test/Transforms/LoopVectorize/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
index 4074194..d40fd22 100644
--- a/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
@@ -19,12 +19,12 @@ define void @_Z3foov() {
; CHECK: vector.body:
; CHECK: br i1 [[TMP6:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: middle.block:
-; CHECK: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5:![0-9]+]]
+; CHECK: br label [[FOR_COND_CLEANUP:%.*]]
; CHECK: scalar.ph:
; CHECK: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK: for.body:
-; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF5:![0-9]+]], !llvm.loop [[LOOP6:![0-9]+]]
;
; CHECK-MASKED-LABEL: @_Z3foov(
; CHECK-MASKED: entry:
@@ -34,19 +34,22 @@ define void @_Z3foov() {
; CHECK-MASKED: vector.body:
; CHECK-MASKED: br i1 [[TMP18:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK-MASKED: middle.block:
-; CHECK-MASKED: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5:![0-9]+]]
+; CHECK-MASKED: br label [[FOR_COND_CLEANUP:%.*]]
; CHECK-MASKED: scalar.ph:
; CHECK-MASKED: br label [[FOR_BODY:%.*]]
; CHECK-MASKED: for.cond.cleanup:
; CHECK-MASKED: for.body:
-; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF5:![0-9]+]], !llvm.loop [[LOOP6:![0-9]+]]
;
; CHECK-SCALABLE-LABEL: @_Z3foov(
; CHECK-SCALABLE: entry:
; CHECK-SCALABLE: br i1 [[MIN_ITERS_CHECK:%.*]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF0:![0-9]+]]
; CHECK-SCALABLE: vector.ph:
+; CHECK-SCALABLE: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP9:%.*]], i64 0
+; CHECK-SCALABLE: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-SCALABLE: br label [[VECTOR_BODY:%.*]]
; CHECK-SCALABLE: vector.body:
+; CHECK-SCALABLE: [[VEC_IND_NEXT:%.*]] = add <vscale x 4 x i32> [[VEC_IND:%.*]], [[BROADCAST_SPLAT]]
; CHECK-SCALABLE: br i1 [[TMP16:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK-SCALABLE: middle.block:
; CHECK-SCALABLE: br i1 [[CMP_N:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5:![0-9]+]]
@@ -85,14 +88,14 @@ define void @_Z3foo2v() {
; CHECK: vector.ph:
; CHECK: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK: br i1 [[TMP6:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: br i1 [[TMP6:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: middle.block:
-; CHECK: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5]]
+; CHECK: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK: for.body:
-; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !prof [[PROF8:![0-9]+]], !llvm.loop [[LOOP9:![0-9]+]]
;
; CHECK-MASKED-LABEL: @_Z3foo2v(
; CHECK-MASKED: entry:
@@ -100,21 +103,24 @@ define void @_Z3foo2v() {
; CHECK-MASKED: vector.ph:
; CHECK-MASKED: br label [[VECTOR_BODY:%.*]]
; CHECK-MASKED: vector.body:
-; CHECK-MASKED: br i1 [[TMP18:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-MASKED: br i1 [[TMP18:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-MASKED: middle.block:
-; CHECK-MASKED: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5]]
+; CHECK-MASKED: br label [[SCALAR_PH]]
; CHECK-MASKED: scalar.ph:
; CHECK-MASKED: br label [[FOR_BODY:%.*]]
; CHECK-MASKED: for.cond.cleanup:
; CHECK-MASKED: for.body:
-; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !prof [[PROF8:![0-9]+]], !llvm.loop [[LOOP9:![0-9]+]]
;
; CHECK-SCALABLE-LABEL: @_Z3foo2v(
; CHECK-SCALABLE: entry:
; CHECK-SCALABLE: br i1 [[MIN_ITERS_CHECK:%.*]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF0]]
; CHECK-SCALABLE: vector.ph:
+; CHECK-SCALABLE: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP9:%.*]], i64 0
+; CHECK-SCALABLE: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-SCALABLE: br label [[VECTOR_BODY:%.*]]
; CHECK-SCALABLE: vector.body:
+; CHECK-SCALABLE: [[VEC_IND_NEXT:%.*]] = add <vscale x 4 x i32> [[VEC_IND:%.*]], [[BROADCAST_SPLAT]]
; CHECK-SCALABLE: br i1 [[TMP16:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-SCALABLE: middle.block:
; CHECK-SCALABLE: br i1 [[CMP_N:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5]]
diff --git a/llvm/test/Transforms/LoopVectorize/constantfolder-infer-correct-gepty.ll b/llvm/test/Transforms/LoopVectorize/constantfolder-infer-correct-gepty.ll
index e6eaa0e..af528ee 100644
--- a/llvm/test/Transforms/LoopVectorize/constantfolder-infer-correct-gepty.ll
+++ b/llvm/test/Transforms/LoopVectorize/constantfolder-infer-correct-gepty.ll
@@ -18,9 +18,9 @@ define void @test(ptr %data) {
; CHECK-NEXT: store i16 [[TMP4]], ptr [[DATA]], align 2, !tbaa [[TBAA4:![0-9]+]]
; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[END:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[END:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/constantfolder.ll b/llvm/test/Transforms/LoopVectorize/constantfolder.ll
index df926fa..cfd36bf 100644
--- a/llvm/test/Transforms/LoopVectorize/constantfolder.ll
+++ b/llvm/test/Transforms/LoopVectorize/constantfolder.ll
@@ -15,9 +15,9 @@ define void @const_fold_ptradd(ptr %dst, i64 %d) {
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -70,9 +70,9 @@ define void @const_fold_inbounds_ptradd(ptr %dst, i64 %d) {
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -126,9 +126,9 @@ define void @const_fold_select(ptr %dst, i64 %d) {
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -181,9 +181,9 @@ define void @const_fold_add_sub_mul_ashr_lshr(ptr %dst, i64 %d) {
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -244,9 +244,9 @@ define void @const_fold_and_or_xor(ptr %dst, i64 %d) {
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -303,9 +303,9 @@ define void @const_fold_cmp_zext(ptr %dst, i64 %d) {
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -360,9 +360,9 @@ define void @const_fold_trunc(ptr %dst, i64 %d) {
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/create-induction-resume.ll b/llvm/test/Transforms/LoopVectorize/create-induction-resume.ll
index 71273a3..e9c7f75 100644
--- a/llvm/test/Transforms/LoopVectorize/create-induction-resume.ll
+++ b/llvm/test/Transforms/LoopVectorize/create-induction-resume.ll
@@ -44,8 +44,6 @@ define void @test(i32 %arg, i32 %L1.limit, i32 %L2.switch, i1 %c, ptr %dst) {
; CHECK: L2.Inner.header.preheader:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP3:%.*]] = mul i32 12, [[INDUCTION_IV_LCSSA1]]
-; CHECK-NEXT: [[IND_END:%.*]] = add i32 1, [[TMP3]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[L1_EXIT_VAL]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[INDUCTION_IV_LCSSA1]], i64 0
@@ -63,17 +61,16 @@ define void @test(i32 %arg, i32 %L1.limit, i32 %L2.switch, i1 %c, ptr %dst) {
; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i32> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i32> [[TMP7]] to <4 x i64>
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DST:%.*]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 0
-; CHECK-NEXT: store <4 x i64> [[TMP8]], ptr [[TMP10]], align 8
+; CHECK-NEXT: store <4 x i64> [[TMP8]], ptr [[TMP9]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[L2_HEADER_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[L2_HEADER_LOOPEXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[L2_INNER_HEADER_PREHEADER]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ 13, [[MIDDLE_BLOCK]] ], [ 1, [[L2_INNER_HEADER_PREHEADER]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1, [[L2_INNER_HEADER_PREHEADER]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ 1, [[L2_INNER_HEADER_PREHEADER]] ]
; CHECK-NEXT: br label [[L2_INNER_HEADER:%.*]]
; CHECK: L2.Inner.header:
; CHECK-NEXT: [[L2_ACCUM:%.*]] = phi i32 [ [[L2_ACCUM_NEXT:%.*]], [[L2_INNER_HEADER]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll b/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll
index ff922d4..baa967c 100644
--- a/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll
+++ b/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll
@@ -30,8 +30,8 @@ define void @foo(ptr %h) !dbg !4 {
; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP32]], label [[FOR_COND5_PREHEADER1]], !dbg [[DBG28]]
; CHECK: vector.latch:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4, !dbg [[DBG222]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20, !dbg [[DBG222]]
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !dbg [[DBG222]], !llvm.loop [[LOOP29:![0-9]+]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20, !dbg [[DBG21]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !dbg [[DBG21]], !llvm.loop [[LOOP29:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]], !dbg [[DBG21]]
; CHECK: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/dead_instructions.ll b/llvm/test/Transforms/LoopVectorize/dead_instructions.ll
index 653baf83..42d45bd 100644
--- a/llvm/test/Transforms/LoopVectorize/dead_instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/dead_instructions.ll
@@ -25,9 +25,8 @@ define i64 @dead_instructions_01(ptr %a, i64 %n) {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
; CHECK-NEXT: [[TMP6]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP7]] = add <2 x i64> [[WIDE_LOAD2]], [[VEC_PHI1]]
@@ -45,16 +44,16 @@ define i64 @dead_instructions_01(ptr %a, i64 %n) {
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[R:%.*]] = phi i64 [ [[TMP5:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[R:%.*]] = phi i64 [ [[TMP6:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I]]
-; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP2]], align 8
-; CHECK-NEXT: [[TMP5]] = add i64 [[TMP4]], [[R]]
+; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP6]] = add i64 [[TMP5]], [[R]]
; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[COND]], label %[[FOR_BODY]], label %[[FOR_END]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[TMP6:%.*]] = phi i64 [ [[TMP5]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i64 [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ [[TMP6]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i64 [[TMP7]]
;
entry:
br label %for.body
@@ -149,23 +148,22 @@ define void @dead_load_and_vector_pointer(ptr %a, ptr %b) {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 2
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 8, !alias.scope [[META6:![0-9]+]], !noalias [[META9:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 8, !alias.scope [[META6:![0-9]+]], !noalias [[META9:![0-9]+]]
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP5]], align 8, !alias.scope [[META6]], !noalias [[META9]]
; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[WIDE_LOAD]], splat (i32 1)
; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[WIDE_LOAD2]], splat (i32 1)
-; CHECK-NEXT: store <2 x i32> [[TMP6]], ptr [[TMP4]], align 4, !alias.scope [[META6]], !noalias [[META9]]
+; CHECK-NEXT: store <2 x i32> [[TMP6]], ptr [[TMP2]], align 4, !alias.scope [[META6]], !noalias [[META9]]
; CHECK-NEXT: store <2 x i32> [[TMP7]], ptr [[TMP5]], align 4, !alias.scope [[META6]], !noalias [[META9]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[EXIT]]:
+; CHECK: [[EXIT:.*]]:
; CHECK-NEXT: ret void
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[PRIMARY:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PRIMARY_ADD:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll b/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll
index 913a1341..9ade6e9 100644
--- a/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll
+++ b/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll
@@ -14,17 +14,17 @@ define i32 @foo(ptr %p) {
; CHECK-NEXT: store i8 0, ptr [[P]], align 1, !dbg [[DBG3:![0-9]+]]
; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG7:![0-9]+]]
+; CHECK-NEXT: br label %[[EXIT:.*]], !dbg [[DBG3]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], !dbg [[DBG8:![0-9]+]]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], !dbg [[DBG7:![0-9]+]]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], !dbg [[DBG8]]
-; CHECK-NEXT: [[CONV:%.*]] = trunc i64 0 to i8, !dbg [[DBG9:![0-9]+]]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], !dbg [[DBG7]]
+; CHECK-NEXT: [[CONV:%.*]] = trunc i64 0 to i8, !dbg [[DBG8:![0-9]+]]
; CHECK-NEXT: store i8 [[CONV]], ptr [[P]], align 1, !dbg [[DBG3]]
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1, !dbg [[DBG10:![0-9]+]]
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1, !dbg [[DBG11:![0-9]+]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG7]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1, !dbg [[DBG9:![0-9]+]]
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1, !dbg [[DBG10:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG11:![0-9]+]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret i32 0
;
@@ -60,16 +60,16 @@ exit: ; preds = %loop
!11 = !{}
;.
; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug)
-; CHECK: [[META1]] = !DIFile(filename: "test.cpp", directory: {{.*}})
+; CHECK: [[META1]] = !DIFile(filename: "{{.*}}test.cpp", directory: {{.*}})
; CHECK: [[DBG3]] = !DILocation(line: 6, scope: [[META4:![0-9]+]])
; CHECK: [[META4]] = distinct !DISubprogram(name: "foo", scope: [[META1]], file: [[META1]], line: 11, type: [[META5:![0-9]+]], spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META6:![0-9]+]])
; CHECK: [[META5]] = distinct !DISubroutineType(types: [[META6]])
; CHECK: [[META6]] = !{}
-; CHECK: [[DBG7]] = !DILocation(line: 9, scope: [[META4]])
-; CHECK: [[DBG8]] = !DILocation(line: 4, scope: [[META4]])
-; CHECK: [[DBG9]] = !DILocation(line: 5, scope: [[META4]])
-; CHECK: [[DBG10]] = !DILocation(line: 7, scope: [[META4]])
-; CHECK: [[DBG11]] = !DILocation(line: 8, scope: [[META4]])
+; CHECK: [[DBG7]] = !DILocation(line: 4, scope: [[META4]])
+; CHECK: [[DBG8]] = !DILocation(line: 5, scope: [[META4]])
+; CHECK: [[DBG9]] = !DILocation(line: 7, scope: [[META4]])
+; CHECK: [[DBG10]] = !DILocation(line: 8, scope: [[META4]])
+; CHECK: [[DBG11]] = !DILocation(line: 9, scope: [[META4]])
; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META13:![0-9]+]], [[META14:![0-9]+]]}
; CHECK: [[META13]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK: [[META14]] = !{!"llvm.loop.isvectorized", i32 1}
diff --git a/llvm/test/Transforms/LoopVectorize/debugloc.ll b/llvm/test/Transforms/LoopVectorize/debugloc.ll
index e7224ae..40cd6b6 100644
--- a/llvm/test/Transforms/LoopVectorize/debugloc.ll
+++ b/llvm/test/Transforms/LoopVectorize/debugloc.ll
@@ -14,9 +14,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; CHECK: load <2 x i32>, ptr {{.*}}, !dbg ![[LOC1]]
; CHECK: add <2 x i32> {{.*}}, !dbg ![[LOC1]]
; CHECK: add nuw i64 %index, 2, !dbg ![[LOC1]]
-; CHECK: icmp eq i64 %index.next, %n.vec, !dbg ![[LOC1]]
+; CHECK: icmp eq i64 %index.next, %n.vec, !dbg ![[BR_LOC:[0-9]+]]
; CHECK: middle.block
-; CHECK: call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %{{.*}}), !dbg ![[BR_LOC:[0-9]+]]
+; CHECK: call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %{{.*}}), !dbg ![[BR_LOC]]
; CHECK: for.body
; CHECK: br i1{{.*}}, label %for.body,{{.*}}, !dbg ![[BR_LOC]],
@@ -141,12 +141,10 @@ define void @test_misc(ptr nocapture %a, ptr noalias %b, i64 %size) !dbg !35 {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr %a, i64 [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr %b, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp uge <2 x i32> [[WIDE_LOAD]], splat (i32 10)
; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> [[WIDE_LOAD]], <2 x i32> zeroinitializer, !dbg [[LOC6:![0-9]+]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0, !dbg [[LOC7:![0-9]+]]
-; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr [[TMP6]], align 4, !dbg [[LOC7]]
+; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr [[TMP2]], align 4, !dbg [[LOC7:![0-9]+]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]],
; CHECK-NEXT: br i1 [[TMP7]], label %middle.block, label %vector.body
diff --git a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll
index 7a54519..373c8e0 100644
--- a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll
@@ -19,8 +19,7 @@ define void @deref_assumption_in_header_constant_trip_count(ptr noalias noundef
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP5]], i64 4), "dereferenceable"(ptr [[TMP5]], i64 4) ]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4
; CHECK-NEXT: [[TMP9:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = xor <2 x i1> [[TMP9]], splat (i1 true)
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
@@ -43,16 +42,15 @@ define void @deref_assumption_in_header_constant_trip_count(ptr noalias noundef
; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i32> [ [[TMP12]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], %[[PRED_LOAD_IF1]] ]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP9]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP17]]
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP31]], align 4
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP30]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -114,8 +112,7 @@ define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[A]], align 4
@@ -123,15 +120,14 @@ define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP7]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP9]], align 4
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -195,8 +191,7 @@ define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalia
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP5]], i64 4), "dereferenceable"(ptr [[TMP5]], i64 2) ]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4
; CHECK-NEXT: [[TMP9:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = xor <2 x i1> [[TMP9]], splat (i1 true)
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
@@ -219,16 +214,15 @@ define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalia
; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i32> [ [[TMP12]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], %[[PRED_LOAD_IF1]] ]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP9]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP17]]
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP31]], align 4
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP30]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -295,8 +289,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP3]], i64 4) ]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i1> [[TMP6]], splat (i1 true)
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
@@ -319,16 +312,15 @@ define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias
; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i32> [ [[TMP12]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], %[[PRED_LOAD_IF1]] ]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP17]]
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP19]], align 4
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP18]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -395,8 +387,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attrib
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP3]], i64 4) ]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i1> [[TMP6]], splat (i1 true)
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
@@ -419,16 +410,15 @@ define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attrib
; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i32> [ [[TMP12]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], %[[PRED_LOAD_IF1]] ]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP17]]
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP19]], align 4
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP18]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -495,8 +485,7 @@ define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP3]], i64 4) ]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i1> [[TMP6]], splat (i1 true)
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
@@ -519,16 +508,15 @@ define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr
; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i32> [ [[TMP12]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], %[[PRED_LOAD_IF1]] ]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP17]]
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP19]], align 4
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP18]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -590,8 +578,7 @@ define void @deref_assumption_in_then_constant_trip_count(ptr noalias noundef %a
; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true)
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]]
@@ -615,16 +602,15 @@ define void @deref_assumption_in_then_constant_trip_count(ptr noalias noundef %a
; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP10]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP15]]
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP27]], align 4
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP26]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -687,8 +673,7 @@ define void @deref_assumption_in_latch_constant_trip_count(ptr noalias noundef %
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true)
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
@@ -717,16 +702,15 @@ define void @deref_assumption_in_latch_constant_trip_count(ptr noalias noundef %
; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP29]], i64 4), "dereferenceable"(ptr [[TMP19]], i64 4) ]
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP31]], align 4
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP30]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -796,8 +780,7 @@ define void @deref_assumption_in_header_variable_trip_count(ptr noalias noundef
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP5]], i64 4), "dereferenceable"(ptr [[TMP5]], i64 4) ]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4
; CHECK-NEXT: [[TMP9:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = xor <2 x i1> [[TMP9]], splat (i1 true)
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
@@ -820,8 +803,7 @@ define void @deref_assumption_in_header_variable_trip_count(ptr noalias noundef
; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i32> [ [[TMP12]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], %[[PRED_LOAD_IF1]] ]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP9]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP17]]
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP31]], align 4
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP30]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -892,23 +874,20 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_1(ptr noali
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = load <2 x i32>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[TMP15:%.*]] = load <2 x i32>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP15]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP17]], align 4
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP16]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -969,8 +948,7 @@ define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true)
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
@@ -995,15 +973,14 @@ define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1
; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP9]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP15]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP17]], align 4
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP16]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -1064,23 +1041,20 @@ define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP6]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -1142,23 +1116,20 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = load <2 x i32>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP15]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP17]], align 4
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP16]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -1219,8 +1190,7 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true)
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
@@ -1245,15 +1215,14 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known
; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP9]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP15]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP17]], align 4
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP16]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -1314,8 +1283,7 @@ define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true)
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
@@ -1340,15 +1308,14 @@ define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4
; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP9]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP15]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP17]], align 4
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP16]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -1411,8 +1378,7 @@ define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true)
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
@@ -1433,15 +1399,14 @@ define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_
; CHECK-NEXT: [[TMP11:%.*]] = phi <2 x i32> [ [[TMP12]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], %[[PRED_LOAD_IF1]] ]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP11]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP9]], align 4
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -1504,8 +1469,7 @@ define void @may_free_local_ptr_align_deref_assumption_in_header_constant_trip_c
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true)
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
@@ -1526,15 +1490,14 @@ define void @may_free_local_ptr_align_deref_assumption_in_header_constant_trip_c
; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i32> [ [[TMP8]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], %[[PRED_LOAD_IF1]] ]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP12]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP14]], align 4
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP13]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-variable-size.ll b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-variable-size.ll
index e771c40..c8cf2ad 100644
--- a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-variable-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-variable-size.ll
@@ -19,15 +19,12 @@ define void @deref_assumption_in_preheader_non_constant_trip_count_access_i8(ptr
; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP2]], align 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp sge <2 x i8> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP4]], <2 x i8> [[WIDE_LOAD]], <2 x i8> [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <2 x i8> [[PREDPHI]], ptr [[TMP7]], align 1
+; CHECK-NEXT: store <2 x i8> [[PREDPHI]], ptr [[TMP6]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -102,15 +99,12 @@ define void @deref_assumption_in_preheader_non_constant_trip_count_access_i32(pt
; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP7]], align 1
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP6]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -185,8 +179,7 @@ define void @deref_assumption_in_preheader_too_small_non_constant_trip_count_acc
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP15:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true)
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP15]], i32 0
@@ -211,8 +204,7 @@ define void @deref_assumption_in_preheader_too_small_non_constant_trip_count_acc
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = phi <2 x i32> [ [[TMP9]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP7]], align 1
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP6]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -285,8 +277,7 @@ define void @deref_assumption_in_preheader_too_small2_non_constant_trip_count_ac
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP15:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true)
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP15]], i32 0
@@ -311,8 +302,7 @@ define void @deref_assumption_in_preheader_too_small2_non_constant_trip_count_ac
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = phi <2 x i32> [ [[TMP9]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP7]], align 1
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP6]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -387,15 +377,12 @@ define void @deref_assumption_in_preheader_non_constant_trip_count_access_i32_al
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP6]], align 1
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP5]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -470,8 +457,7 @@ define void @deref_assumption_in_preheader_non_constant_trip_count_access_i32_al
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true)
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
@@ -496,8 +482,7 @@ define void @deref_assumption_in_preheader_non_constant_trip_count_access_i32_al
; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP9]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP15]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP17]], align 1
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP16]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll
index 3c556e3d..4f95bdd 100644
--- a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll
+++ b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll
@@ -15,19 +15,18 @@ define dso_local void @constTC(ptr noalias nocapture %A) optsize {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 2
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 4
-; CHECK-NEXT: store <2 x i32> splat (i32 13), ptr [[TMP6]], align 1
+; CHECK-NEXT: store <2 x i32> splat (i32 13), ptr [[TMP3]], align 1
; CHECK-NEXT: store <2 x i32> splat (i32 13), ptr [[TMP7]], align 1
; CHECK-NEXT: store <2 x i32> splat (i32 13), ptr [[TMP8]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 6
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1800
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1800, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[RIV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[RIVPLUS1:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
index aa8299b..1936b40 100644
--- a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
+++ b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
@@ -19,8 +19,7 @@ define dso_local void @alignTC(ptr noalias nocapture %A, i32 %n) optsize {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <4 x i32> splat (i32 13), ptr [[TMP2]], align 1
+; CHECK-NEXT: store <4 x i32> splat (i32 13), ptr [[TMP1]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -83,8 +82,7 @@ define dso_local void @assumeAlignedTC(ptr noalias nocapture %A, i32 %p, i32 %q)
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <4 x i32> splat (i32 13), ptr [[TMP2]], align 1
+; CHECK-NEXT: store <4 x i32> splat (i32 13), ptr [[TMP1]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll
index 6bf8883..968e107 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll
@@ -19,8 +19,7 @@ define i64 @select_icmp_const(ptr %a, i64 %n) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 3)
; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4
@@ -55,8 +54,7 @@ define i64 @select_icmp_const(ptr %a, i64 %n) {
; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ [[DOTSPLAT]], %[[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD8]], splat (i64 3)
; CHECK-NEXT: [[TMP11]] = select <4 x i1> [[TMP10]], <4 x i64> [[VEC_IND5]], <4 x i64> [[VEC_PHI7]]
; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[TMP7]], 4
@@ -70,12 +68,12 @@ define i64 @select_icmp_const(ptr %a, i64 %n) {
; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[N]], [[N_VEC3]]
; CHECK-NEXT: br i1 [[CMP_N12]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX14:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 3, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 3, %[[ITER_CHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL13]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX14]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX16]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 8
; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[L]], 3
@@ -123,8 +121,7 @@ define i64 @select_fcmp_const_fast(ptr %a, i64 %n) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00)
; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4
@@ -159,8 +156,7 @@ define i64 @select_fcmp_const_fast(ptr %a, i64 %n) {
; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ [[DOTSPLAT]], %[[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP8]], align 4
; CHECK-NEXT: [[TMP10:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD8]], splat (float 3.000000e+00)
; CHECK-NEXT: [[TMP11]] = select <4 x i1> [[TMP10]], <4 x i64> [[VEC_IND5]], <4 x i64> [[VEC_PHI7]]
; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[TMP7]], 4
@@ -174,12 +170,12 @@ define i64 @select_fcmp_const_fast(ptr %a, i64 %n) {
; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[N]], [[N_VEC3]]
; CHECK-NEXT: br i1 [[CMP_N12]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX14:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 2, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 2, %[[ITER_CHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL13]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX14]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX16]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP]], align 4
; CHECK-NEXT: [[C:%.*]] = fcmp fast ueq float [[L]], 3.000000e+00
@@ -233,8 +229,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i8> [ splat (i8 -128), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[INDEX]] to i8
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP6]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP5]], align 8
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], splat (i8 3)
; CHECK-NEXT: [[TMP8]] = select <4 x i1> [[TMP7]], <4 x i8> [[VEC_IND]], <4 x i8> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
@@ -273,8 +268,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i8> [ [[DOTSPLAT]], %[[VEC_EPILOG_PH]] ], [ [[TMP17:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX4]] to i8
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i8>, ptr [[TMP15]], align 8
+; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i8>, ptr [[TMP14]], align 8
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD10]], splat (i8 3)
; CHECK-NEXT: [[TMP17]] = select <4 x i1> [[TMP16]], <4 x i8> [[VEC_IND5]], <4 x i8> [[VEC_PHI7]]
; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i32 [[INDEX4]], 4
@@ -288,12 +282,12 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC3]]
; CHECK-NEXT: br i1 [[CMP_N14]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i8 [ [[TMP13]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX14:%.*]] = phi i8 [ [[RDX_SELECT13]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i8 [ [[TMP13]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i8 [ [[RDX_SELECT13]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL13]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi i8 [ [[BC_MERGE_RDX14]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RDX:%.*]] = phi i8 [ [[BC_MERGE_RDX16]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[IV]]
; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP]], align 8
; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 3
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
index 6e62ff8..c5ecf86b 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
@@ -21,8 +21,7 @@ define i32 @any_of_reduction_epilog(ptr %src, i64 %N) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -51,8 +50,7 @@ define i32 @any_of_reduction_epilog(ptr %src, i64 %N) {
; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i1> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX5]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP12]], align 1
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD7]], zeroinitializer
; CHECK-NEXT: [[TMP14]] = or <4 x i1> [[VEC_PHI6]], [[TMP13]]
; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 4
@@ -118,8 +116,7 @@ define i32 @any_of_reduction_epilog_arg_as_start_value(ptr %src, i64 %N, i32 %st
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -148,8 +145,7 @@ define i32 @any_of_reduction_epilog_arg_as_start_value(ptr %src, i64 %N, i32 %st
; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i1> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX5]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP12]], align 1
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD7]], zeroinitializer
; CHECK-NEXT: [[TMP14]] = or <4 x i1> [[VEC_PHI6]], [[TMP13]]
; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
index c101d6a..2d0d30d 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
@@ -20,8 +20,7 @@ define i64 @int_reduction_add(ptr %a, i64 %N) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ <i64 5, i64 0, i64 0, i64 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3]] = add <4 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -45,8 +44,7 @@ define i64 @int_reduction_add(ptr %a, i64 %N) {
; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i64> [ [[TMP6]], [[VEC_EPILOG_PH]] ], [ [[TMP10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX4]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP8]], align 4
; CHECK-NEXT: [[TMP10]] = add <4 x i64> [[WIDE_LOAD6]], [[VEC_PHI5]]
; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX4]], 4
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC3]]
@@ -108,8 +106,7 @@ define float @fp_reduction_max(ptr noalias %a, i64 %N) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast ogt <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP4]] = select fast <4 x i1> [[TMP3]], <4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -135,8 +132,7 @@ define float @fp_reduction_max(ptr noalias %a, i64 %N) {
; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x float> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX4]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP8]], align 4
; CHECK-NEXT: [[TMP10:%.*]] = fcmp fast ogt <4 x float> [[VEC_PHI5]], [[WIDE_LOAD6]]
; CHECK-NEXT: [[TMP11]] = select fast <4 x i1> [[TMP10]], <4 x float> [[VEC_PHI5]], <4 x float> [[WIDE_LOAD6]]
; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX4]], 4
@@ -198,8 +194,7 @@ define i16 @reduction_or_trunc(ptr noalias nocapture %ptr) {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[VEC_PHI]], splat (i32 65535)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP3]], align 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP2]], align 2
; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> [[TMP1]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[TMP5]] to <4 x i16>
@@ -223,8 +218,7 @@ define i16 @reduction_or_trunc(ptr noalias nocapture %ptr) {
; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ [[TMP12]], [[VEC_EPILOG_PH]] ], [ [[TMP20:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP14:%.*]] = and <4 x i32> [[VEC_PHI2]], splat (i32 65535)
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i32 [[INDEX1]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i16, ptr [[TMP15]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i16>, ptr [[TMP16]], align 2
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i16>, ptr [[TMP15]], align 2
; CHECK-NEXT: [[TMP17:%.*]] = zext <4 x i16> [[WIDE_LOAD3]] to <4 x i32>
; CHECK-NEXT: [[TMP18:%.*]] = or <4 x i32> [[TMP14]], [[TMP17]]
; CHECK-NEXT: [[TMP19:%.*]] = trunc <4 x i32> [[TMP18]] to <4 x i16>
@@ -296,8 +290,7 @@ define float @multiple_fp_rdx(ptr %A, i64 %N) {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 1.500000e+01, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x float> [ <float 1.000000e+01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3]] = fadd fast <4 x float> [[VEC_PHI2]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP4]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -326,8 +319,7 @@ define float @multiple_fp_rdx(ptr %A, i64 %N) {
; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x float> [ [[TMP8]], [[VEC_EPILOG_PH]] ], [ [[TMP14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi <4 x float> [ [[TMP9]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX6]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP12]], align 4
+; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP11]], align 4
; CHECK-NEXT: [[TMP13]] = fadd fast <4 x float> [[VEC_PHI8]], [[WIDE_LOAD9]]
; CHECK-NEXT: [[TMP14]] = fmul fast <4 x float> [[VEC_PHI7]], [[WIDE_LOAD9]]
; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX6]], 4
@@ -404,8 +396,7 @@ define i32 @reduction_phi_start_val(ptr %A, i64 %N) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP4]] = sub <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -429,8 +420,7 @@ define i32 @reduction_phi_start_val(ptr %A, i64 %N) {
; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ [[TMP7]], [[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX4]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4
; CHECK-NEXT: [[TMP11]] = sub <4 x i32> [[VEC_PHI5]], [[WIDE_LOAD6]]
; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX4]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC3]]
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-trunc-induction-steps.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-trunc-induction-steps.ll
index 587dd88..6384343 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-trunc-induction-steps.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-trunc-induction-steps.ll
@@ -27,10 +27,9 @@ define void @trunc_iv_steps_with_epilogue(ptr %A, i64 %N) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDEX]] to i32
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i32 [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i8> [[WIDE_LOAD]], splat (i8 2)
-; CHECK-NEXT: store <4 x i8> [[TMP9]], ptr [[TMP8]], align 1
+; CHECK-NEXT: store <4 x i8> [[TMP9]], ptr [[TMP7]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -50,10 +49,9 @@ define void @trunc_iv_steps_with_epilogue(ptr %A, i64 %N) {
; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[INDEX5]] to i32
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[A]], i32 [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP14]], align 1
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i8> [[WIDE_LOAD6]], splat (i8 2)
-; CHECK-NEXT: store <4 x i8> [[TMP15]], ptr [[TMP14]], align 1
+; CHECK-NEXT: store <4 x i8> [[TMP15]], ptr [[TMP13]], align 1
; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX5]], 4
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC3]]
; CHECK-NEXT: br i1 [[TMP16]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/expand-scev-after-invoke.ll b/llvm/test/Transforms/LoopVectorize/expand-scev-after-invoke.ll
index 02e92cd..8556ceb 100644
--- a/llvm/test/Transforms/LoopVectorize/expand-scev-after-invoke.ll
+++ b/llvm/test/Transforms/LoopVectorize/expand-scev-after-invoke.ll
@@ -28,16 +28,15 @@ define void @test(ptr %dst) personality ptr null {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4
-; CHECK-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP5]], align 8
+; CHECK-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP4]], align 8
; CHECK-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP6]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], [[TMP1]]
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 160
; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 160, %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
@@ -50,7 +49,7 @@ define void @test(ptr %dst) personality ptr null {
; CHECK-NEXT: store i32 [[IV_2]], ptr [[GEP]], align 8
; CHECK-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], [[STEP]]
; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV_1]], 161
-; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
; CHECK: [[LPAD]]:
diff --git a/llvm/test/Transforms/LoopVectorize/extract-from-end-vector-constant.ll b/llvm/test/Transforms/LoopVectorize/extract-from-end-vector-constant.ll
index f1e3ef0..eb0145e 100644
--- a/llvm/test/Transforms/LoopVectorize/extract-from-end-vector-constant.ll
+++ b/llvm/test/Transforms/LoopVectorize/extract-from-end-vector-constant.ll
@@ -13,13 +13,12 @@ define i64 @exit_value_scalar_live_in(ptr %dst, i64 %in) {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr [[TMP2]], align 2
+; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr [[TMP1]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
@@ -29,9 +28,9 @@ define i64 @exit_value_scalar_live_in(ptr %dst, i64 %in) {
; CHECK-NEXT: store i16 0, ptr [[GEP]], align 2
; CHECK-NEXT: [[IV_NEXT_1]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[IN]], %[[LOOP]] ], [ [[IN]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[IN]], %[[LOOP]] ]
; CHECK-NEXT: ret i64 [[RES]]
;
entry:
@@ -60,13 +59,12 @@ define <2 x i64> @exit_value_vector_live_in(ptr %dst) {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr [[TMP2]], align 2
+; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr [[TMP1]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
@@ -76,9 +74,9 @@ define <2 x i64> @exit_value_vector_live_in(ptr %dst) {
; CHECK-NEXT: store i16 0, ptr [[GEP]], align 2
; CHECK-NEXT: [[IV_NEXT_1]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi <2 x i64> [ zeroinitializer, %[[LOOP]] ], [ zeroinitializer, %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi <2 x i64> [ zeroinitializer, %[[LOOP]] ]
; CHECK-NEXT: ret <2 x i64> [[RES]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
index 03c334b..d3c8c13 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
@@ -1,37 +1,40 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s
-define i16 @test_chained_first_order_recurrences_1(ptr %ptr) {
+define i16 @test_chained_first_order_recurrences_1(ptr %ptr, i64 %n) {
; CHECK-LABEL: define i16 @test_chained_first_order_recurrences_1(
-; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP0]], align 2
; CHECK-NEXT: [[TMP2]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[TMP3]]
-; CHECK-NEXT: store <4 x i16> [[TMP4]], ptr [[TMP1]], align 2
+; CHECK-NEXT: store <4 x i16> [[TMP4]], ptr [[TMP0]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT2]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[FOR_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ]
@@ -42,7 +45,7 @@ define i16 @test_chained_first_order_recurrences_1(ptr %ptr) {
; CHECK-NEXT: [[FOR_1_NEXT]] = load i16, ptr [[GEP_PTR]], align 2
; CHECK-NEXT: [[ADD:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
; CHECK-NEXT: store i16 [[ADD]], ptr [[GEP_PTR]], align 2
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
@@ -62,7 +65,7 @@ loop:
%for.1.next = load i16, ptr %gep.ptr, align 2
%add = add i16 %for.1, %for.2
store i16 %add, ptr %gep.ptr
- %exitcond.not = icmp eq i64 %iv.next, 1000
+ %exitcond.not = icmp eq i64 %iv.next, %n
br i1 %exitcond.not, label %exit, label %loop
exit:
@@ -70,37 +73,40 @@ exit:
ret i16 %res
}
-define i16 @test_chained_first_order_recurrences_2(ptr %ptr) {
+define i16 @test_chained_first_order_recurrences_2(ptr %ptr, i64 %n) {
; CHECK-LABEL: define i16 @test_chained_first_order_recurrences_2(
-; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP0]], align 2
; CHECK-NEXT: [[TMP2]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[TMP3]]
-; CHECK-NEXT: store <4 x i16> [[TMP4]], ptr [[TMP1]], align 2
+; CHECK-NEXT: store <4 x i16> [[TMP4]], ptr [[TMP0]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT2]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[FOR_2:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_1:%.*]], %[[LOOP]] ]
@@ -111,7 +117,7 @@ define i16 @test_chained_first_order_recurrences_2(ptr %ptr) {
; CHECK-NEXT: [[FOR_1_NEXT]] = load i16, ptr [[GEP_PTR]], align 2
; CHECK-NEXT: [[ADD:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
; CHECK-NEXT: store i16 [[ADD]], ptr [[GEP_PTR]], align 2
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
@@ -131,7 +137,7 @@ loop:
%for.1.next = load i16, ptr %gep.ptr, align 2
%add = add i16 %for.1, %for.2
store i16 %add, ptr %gep.ptr
- %exitcond.not = icmp eq i64 %iv.next, 1000
+ %exitcond.not = icmp eq i64 %iv.next, %n
br i1 %exitcond.not, label %exit, label %loop
exit:
@@ -139,12 +145,15 @@ exit:
ret i16 %res
}
-define i16 @test_chained_first_order_recurrences_3(ptr %ptr) {
+define i16 @test_chained_first_order_recurrences_3(ptr %ptr, i64 %n) {
; CHECK-LABEL: define i16 @test_chained_first_order_recurrences_3(
-; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -152,30 +161,30 @@ define i16 @test_chained_first_order_recurrences_3(ptr %ptr) {
; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP0]], align 2
; CHECK-NEXT: [[TMP2]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP3]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i16> [[TMP2]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i16> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: store <4 x i16> [[TMP6]], ptr [[TMP1]], align 2
+; CHECK-NEXT: store <4 x i16> [[TMP6]], ptr [[TMP0]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT4:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI5:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI6:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT7:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT5]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT7:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT4]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[FOR_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ]
@@ -188,7 +197,7 @@ define i16 @test_chained_first_order_recurrences_3(ptr %ptr) {
; CHECK-NEXT: [[ADD_1:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
; CHECK-NEXT: [[ADD_2:%.*]] = add i16 [[ADD_1]], [[FOR_3]]
; CHECK-NEXT: store i16 [[ADD_2]], ptr [[GEP_PTR]], align 2
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
@@ -212,7 +221,7 @@ loop:
%add.1 = add i16 %for.1, %for.2
%add.2 = add i16 %add.1, %for.3
store i16 %add.2, ptr %gep.ptr
- %exitcond.not = icmp eq i64 %iv.next, 1000
+ %exitcond.not = icmp eq i64 %iv.next, %n
br i1 %exitcond.not, label %exit, label %loop
exit:
@@ -259,35 +268,38 @@ exit:
ret void
}
-define void @test_first_order_recurrences_incoming_cycle_preheader(ptr %ptr) {
+define void @test_first_order_recurrences_incoming_cycle_preheader(ptr %ptr, i64 %n) {
; CHECK-LABEL: define void @test_first_order_recurrences_incoming_cycle_preheader(
-; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[LOOP_1:.*]]
; CHECK: [[LOOP_1]]:
; CHECK-NEXT: br i1 true, label %[[LOOP_PREHEADER:.*]], label %[[LOOP_1]]
; CHECK: [[LOOP_PREHEADER]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP0]], align 2
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i16> [[TMP2]], splat (i16 10)
-; CHECK-NEXT: store <4 x i16> [[TMP3]], ptr [[TMP1]], align 2
+; CHECK-NEXT: store <4 x i16> [[TMP3]], ptr [[TMP0]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[FOR_1:%.*]] = phi i16 [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ], [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ]
@@ -297,7 +309,7 @@ define void @test_first_order_recurrences_incoming_cycle_preheader(ptr %ptr) {
; CHECK-NEXT: [[FOR_1_NEXT]] = load i16, ptr [[GEP_PTR]], align 2
; CHECK-NEXT: [[ADD:%.*]] = add i16 [[FOR_1]], 10
; CHECK-NEXT: store i16 [[ADD]], ptr [[GEP_PTR]], align 2
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
@@ -317,19 +329,22 @@ loop:
%for.1.next = load i16, ptr %gep.ptr, align 2
%add = add i16 %for.1, 10
store i16 %add, ptr %gep.ptr
- %exitcond.not = icmp eq i64 %iv.next, 1000
+ %exitcond.not = icmp eq i64 %iv.next, %n
br i1 %exitcond.not, label %exit, label %loop
exit:
ret void
}
-define i16 @test_chained_first_order_recurrences_3_reordered_1(ptr %ptr) {
+define i16 @test_chained_first_order_recurrences_3_reordered_1(ptr %ptr, i64 %n) {
; CHECK-LABEL: define i16 @test_chained_first_order_recurrences_3_reordered_1(
-; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -337,30 +352,30 @@ define i16 @test_chained_first_order_recurrences_3_reordered_1(ptr %ptr) {
; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP0]], align 2
; CHECK-NEXT: [[TMP2]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP3]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i16> [[TMP2]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i16> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: store <4 x i16> [[TMP6]], ptr [[TMP1]], align 2
+; CHECK-NEXT: store <4 x i16> [[TMP6]], ptr [[TMP0]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT4:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI5:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI6:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT7:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT5]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT7:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT4]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[FOR_3:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_2:%.*]], %[[LOOP]] ]
@@ -373,7 +388,7 @@ define i16 @test_chained_first_order_recurrences_3_reordered_1(ptr %ptr) {
; CHECK-NEXT: [[ADD_1:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
; CHECK-NEXT: [[ADD_2:%.*]] = add i16 [[ADD_1]], [[FOR_3]]
; CHECK-NEXT: store i16 [[ADD_2]], ptr [[GEP_PTR]], align 2
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[FOR_3_LCSSA:%.*]] = phi i16 [ [[FOR_3]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
@@ -397,7 +412,7 @@ loop:
%add.1 = add i16 %for.1, %for.2
%add.2 = add i16 %add.1, %for.3
store i16 %add.2, ptr %gep.ptr
- %exitcond.not = icmp eq i64 %iv.next, 1000
+ %exitcond.not = icmp eq i64 %iv.next, %n
br i1 %exitcond.not, label %exit, label %loop
exit:
@@ -406,12 +421,15 @@ exit:
ret i16 %res.2
}
-define i16 @test_chained_first_order_recurrences_3_reordered_2(ptr %ptr) {
+define i16 @test_chained_first_order_recurrences_3_reordered_2(ptr %ptr, i64 %n) {
; CHECK-LABEL: define i16 @test_chained_first_order_recurrences_3_reordered_2(
-; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -419,30 +437,30 @@ define i16 @test_chained_first_order_recurrences_3_reordered_2(ptr %ptr) {
; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP0]], align 2
; CHECK-NEXT: [[TMP2]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP3]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i16> [[TMP2]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i16> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: store <4 x i16> [[TMP6]], ptr [[TMP1]], align 2
+; CHECK-NEXT: store <4 x i16> [[TMP6]], ptr [[TMP0]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT4:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI5:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI6:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT7:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT5]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT7:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT4]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[FOR_2:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_1:%.*]], %[[LOOP]] ]
@@ -455,7 +473,7 @@ define i16 @test_chained_first_order_recurrences_3_reordered_2(ptr %ptr) {
; CHECK-NEXT: [[ADD_1:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
; CHECK-NEXT: [[ADD_2:%.*]] = add i16 [[ADD_1]], [[FOR_3]]
; CHECK-NEXT: store i16 [[ADD_2]], ptr [[GEP_PTR]], align 2
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
@@ -479,7 +497,7 @@ loop:
%add.1 = add i16 %for.1, %for.2
%add.2 = add i16 %add.1, %for.3
store i16 %add.2, ptr %gep.ptr
- %exitcond.not = icmp eq i64 %iv.next, 1000
+ %exitcond.not = icmp eq i64 %iv.next, %n
br i1 %exitcond.not, label %exit, label %loop
exit:
@@ -488,12 +506,15 @@ exit:
ret i16 %res.2
}
-define i16 @test_chained_first_order_recurrences_3_for2_no_other_uses(ptr %ptr) {
+define i16 @test_chained_first_order_recurrences_3_for2_no_other_uses(ptr %ptr, i64 %n) {
; CHECK-LABEL: define i16 @test_chained_first_order_recurrences_3_for2_no_other_uses(
-; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -501,30 +522,30 @@ define i16 @test_chained_first_order_recurrences_3_for2_no_other_uses(ptr %ptr)
; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP0]], align 2
; CHECK-NEXT: [[TMP2]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP3]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i16> [[TMP2]], splat (i16 10)
; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i16> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: store <4 x i16> [[TMP6]], ptr [[TMP1]], align 2
+; CHECK-NEXT: store <4 x i16> [[TMP6]], ptr [[TMP0]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT4:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI5:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI6:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT7:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT5]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT7:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT4]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[FOR_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ]
@@ -537,7 +558,7 @@ define i16 @test_chained_first_order_recurrences_3_for2_no_other_uses(ptr %ptr)
; CHECK-NEXT: [[ADD_1:%.*]] = add i16 [[FOR_1]], 10
; CHECK-NEXT: [[ADD_2:%.*]] = add i16 [[ADD_1]], [[FOR_3]]
; CHECK-NEXT: store i16 [[ADD_2]], ptr [[GEP_PTR]], align 2
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
@@ -561,7 +582,7 @@ loop:
%add.1 = add i16 %for.1, 10
%add.2 = add i16 %add.1, %for.3
store i16 %add.2, ptr %gep.ptr
- %exitcond.not = icmp eq i64 %iv.next, 1000
+ %exitcond.not = icmp eq i64 %iv.next, %n
br i1 %exitcond.not, label %exit, label %loop
exit:
@@ -570,12 +591,15 @@ exit:
ret i16 %res.2
}
-define i16 @test_chained_first_order_recurrences_3_for1_for2_no_other_uses(ptr %ptr) {
+define i16 @test_chained_first_order_recurrences_3_for1_for2_no_other_uses(ptr %ptr, i64 %n) {
; CHECK-LABEL: define i16 @test_chained_first_order_recurrences_3_for1_for2_no_other_uses(
-; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -583,29 +607,29 @@ define i16 @test_chained_first_order_recurrences_3_for1_for2_no_other_uses(ptr %
; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP0]], align 2
; CHECK-NEXT: [[TMP2]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP3]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i16> [[TMP4]], splat (i16 10)
-; CHECK-NEXT: store <4 x i16> [[TMP5]], ptr [[TMP1]], align 2
+; CHECK-NEXT: store <4 x i16> [[TMP5]], ptr [[TMP0]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT4:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI5:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI6:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT7:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT5]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT7:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT4]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[FOR_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ]
@@ -617,7 +641,7 @@ define i16 @test_chained_first_order_recurrences_3_for1_for2_no_other_uses(ptr %
; CHECK-NEXT: [[FOR_1_NEXT]] = load i16, ptr [[GEP_PTR]], align 2
; CHECK-NEXT: [[ADD_1:%.*]] = add i16 [[FOR_3]], 10
; CHECK-NEXT: store i16 [[ADD_1]], ptr [[GEP_PTR]], align 2
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
@@ -640,7 +664,7 @@ loop:
%for.1.next = load i16, ptr %gep.ptr, align 2
%add.1 = add i16 %for.3, 10
store i16 %add.1, ptr %gep.ptr
- %exitcond.not = icmp eq i64 %iv.next, 1000
+ %exitcond.not = icmp eq i64 %iv.next, %n
br i1 %exitcond.not, label %exit, label %loop
exit:
@@ -649,57 +673,51 @@ exit:
ret i16 %res.2
}
-define double @test_chained_first_order_recurrence_sink_users_1(ptr %ptr) {
+define double @test_chained_first_order_recurrence_sink_users_1(ptr %ptr, i64 %n) {
; CHECK-LABEL: define double @test_chained_first_order_recurrence_sink_users_1(
-; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP6]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP6]], 4
+; CHECK-NEXT: [[INDEX:%.*]] = sub i64 [[TMP6]], [[N_MOD_VF]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 1.000000e+01>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 2.000000e+01>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds double, ptr [[PTR]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x double>, ptr [[TMP1]], align 8
-; CHECK-NEXT: [[TMP2]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR1]], <4 x double> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x double> splat (double 1.000000e+01), [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x double> [[TMP4]], [[TMP2]]
-; CHECK-NEXT: store <4 x double> [[TMP5]], ptr [[TMP1]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996
-; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x double> [[TMP2]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x double> [[TMP2]], i32 2
-; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP6]], [[INDEX]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 1.000000e+01, %[[ENTRY]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 2.000000e+01, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[OFFSET_IDX]], %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[FOR_1:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[FOR_2:%.*]] = phi double [ [[SCALAR_RECUR_INIT4]], %[[SCALAR_PH]] ], [ [[FOR_1]], %[[LOOP]] ]
+; CHECK-NEXT: [[FOR_2_LCSSA:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[FOR_2:%.*]] = phi double [ [[SCALAR_RECUR_INIT4]], %[[SCALAR_PH]] ], [ [[FOR_2_LCSSA]], %[[LOOP]] ]
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[ADD_1:%.*]] = fadd double 1.000000e+01, [[FOR_2]]
-; CHECK-NEXT: [[ADD_2:%.*]] = fadd double [[ADD_1]], [[FOR_1]]
+; CHECK-NEXT: [[FOR_1_LCSSA:%.*]] = fadd double 1.000000e+01, [[FOR_2]]
+; CHECK-NEXT: [[RES:%.*]] = fadd double [[FOR_1_LCSSA]], [[FOR_2_LCSSA]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds double, ptr [[PTR]], i64 [[IV]]
; CHECK-NEXT: [[FOR_1_NEXT]] = load double, ptr [[GEP_PTR]], align 8
-; CHECK-NEXT: store double [[ADD_2]], ptr [[GEP_PTR]], align 8
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT: store double [[RES]], ptr [[GEP_PTR]], align 8
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[FOR_1_LCSSA:%.*]] = phi double [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[FOR_2_LCSSA:%.*]] = phi double [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI3]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[RES:%.*]] = fadd double [[FOR_1_LCSSA]], [[FOR_2_LCSSA]]
-; CHECK-NEXT: ret double [[RES]]
+; CHECK-NEXT: [[FOR_1_LCSSA1:%.*]] = phi double [ [[FOR_2_LCSSA]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[FOR_2_LCSSA1:%.*]] = phi double [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI3]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RES1:%.*]] = fadd double [[FOR_1_LCSSA1]], [[FOR_2_LCSSA1]]
+; CHECK-NEXT: ret double [[RES1]]
;
entry:
br label %loop
@@ -714,7 +732,7 @@ loop:
%gep.ptr = getelementptr inbounds double, ptr %ptr, i64 %iv
%for.1.next = load double, ptr %gep.ptr, align 8
store double %add.2, ptr %gep.ptr
- %exitcond.not = icmp eq i64 %iv.next, 1000
+ %exitcond.not = icmp eq i64 %iv.next, %n
br i1 %exitcond.not, label %exit, label %loop
exit:
@@ -722,9 +740,9 @@ exit:
ret double %res
}
-define void @test_first_order_recurrences_and_reduction(ptr %ptr) {
+define void @test_first_order_recurrences_and_reduction(ptr %ptr, i64 %n) {
; CHECK-LABEL: define void @test_first_order_recurrences_and_reduction(
-; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
@@ -738,7 +756,7 @@ define void @test_first_order_recurrences_and_reduction(ptr %ptr) {
; CHECK-NEXT: [[ADD_1:%.*]] = add i16 [[FOR_1]], 10
; CHECK-NEXT: [[RED_NEXT]] = add i16 [[RED]], [[LV]]
; CHECK-NEXT: store i16 [[ADD_1]], ptr [[GEP_PTR]], align 2
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
@@ -757,19 +775,22 @@ loop:
%add.1 = add i16 %for.1, 10
%red.next = add i16 %red, %lv
store i16 %add.1, ptr %gep.ptr
- %exitcond.not = icmp eq i64 %iv.next, 1000
+ %exitcond.not = icmp eq i64 %iv.next, %n
br i1 %exitcond.not, label %exit, label %loop
exit:
ret void
}
-define i64 @test_first_order_recurrences_and_induction(ptr %ptr) {
+define i64 @test_first_order_recurrences_and_induction(ptr %ptr, i64 %n) {
; CHECK-LABEL: define i64 @test_first_order_recurrences_and_induction(
-; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -778,19 +799,19 @@ define i64 @test_first_order_recurrences_and_induction(ptr %ptr) {
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[VEC_IND]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP1]], splat (i64 10)
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP3]], align 4
+; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP2]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 2
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[FOR_1:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[IV:%.*]], %[[LOOP]] ]
@@ -799,7 +820,7 @@ define i64 @test_first_order_recurrences_and_induction(ptr %ptr) {
; CHECK-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[IV]]
; CHECK-NEXT: [[ADD_1:%.*]] = add i64 [[FOR_1]], 10
; CHECK-NEXT: store i64 [[ADD_1]], ptr [[GEP_PTR]], align 4
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[FOR_1_LCSSA:%.*]] = phi i64 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
@@ -816,7 +837,7 @@ loop:
%gep.ptr = getelementptr inbounds i64, ptr %ptr, i64 %iv
%add.1 = add i64 %for.1, 10
store i64 %add.1, ptr %gep.ptr
- %exitcond.not = icmp eq i64 %iv.next, 1000
+ %exitcond.not = icmp eq i64 %iv.next, %n
br i1 %exitcond.not, label %exit, label %loop
exit:
@@ -825,12 +846,15 @@ exit:
; Same as @test_first_order_recurrences_and_induction but with order of phis
; flipped.
-define i64 @test_first_order_recurrences_and_induction2(ptr %ptr) {
+define i64 @test_first_order_recurrences_and_induction2(ptr %ptr, i64 %n) {
; CHECK-LABEL: define i64 @test_first_order_recurrences_and_induction2(
-; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -839,18 +863,18 @@ define i64 @test_first_order_recurrences_and_induction2(ptr %ptr) {
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[VEC_IND]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP1]], splat (i64 10)
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP3]], align 4
+; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP2]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 2
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
@@ -860,7 +884,7 @@ define i64 @test_first_order_recurrences_and_induction2(ptr %ptr) {
; CHECK-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[IV]]
; CHECK-NEXT: [[ADD_1:%.*]] = add i64 [[FOR_1]], 10
; CHECK-NEXT: store i64 [[ADD_1]], ptr [[GEP_PTR]], align 4
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[FOR_1_LCSSA:%.*]] = phi i64 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
@@ -876,20 +900,24 @@ loop:
%gep.ptr = getelementptr inbounds i64, ptr %ptr, i64 %iv
%add.1 = add i64 %for.1, 10
store i64 %add.1, ptr %gep.ptr
- %exitcond.not = icmp eq i64 %iv.next, 1000
+ %exitcond.not = icmp eq i64 %iv.next, %n
br i1 %exitcond.not, label %exit, label %loop
exit:
ret i64 %for.1
}
-define ptr @test_first_order_recurrences_and_pointer_induction1(ptr %ptr) {
+define ptr @test_first_order_recurrences_and_pointer_induction1(ptr %ptr, i64 %n) {
; CHECK-LABEL: define ptr @test_first_order_recurrences_and_pointer_induction1(
-; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR]], i64 4000
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[N_VEC]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP2]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -898,21 +926,21 @@ define ptr @test_first_order_recurrences_and_pointer_induction1(ptr %ptr) {
; CHECK-NEXT: [[TMP0]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 4, i64 8, i64 12>
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[VECTOR_RECUR]], <4 x ptr> [[TMP0]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i32 0
-; CHECK-NEXT: store <4 x ptr> [[TMP0]], ptr [[TMP4]], align 8
-; CHECK-NEXT: store <4 x ptr> [[TMP1]], ptr [[TMP4]], align 8
+; CHECK-NEXT: store <4 x ptr> [[TMP0]], ptr [[TMP3]], align 8
+; CHECK-NEXT: store <4 x ptr> [[TMP1]], ptr [[TMP3]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 16
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 2
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi ptr [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ null, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[PTR]], %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ [[PTR]], %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -923,7 +951,7 @@ define ptr @test_first_order_recurrences_and_pointer_induction1(ptr %ptr) {
; CHECK-NEXT: store ptr [[PTR_IV]], ptr [[GEP_PTR]], align 8
; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr i32, ptr [[PTR_IV]], i64 1
; CHECK-NEXT: store ptr [[FOR_1]], ptr [[GEP_PTR]], align 8
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP25:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[FOR_1_LCSSA:%.*]] = phi ptr [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
@@ -941,7 +969,7 @@ loop:
store ptr %ptr.iv, ptr %gep.ptr
%ptr.iv.next = getelementptr i32, ptr %ptr.iv, i64 1
store ptr %for.1, ptr %gep.ptr
- %exitcond.not = icmp eq i64 %iv.next, 1000
+ %exitcond.not = icmp eq i64 %iv.next, %n
br i1 %exitcond.not, label %exit, label %loop
exit:
@@ -950,13 +978,17 @@ exit:
; Same as @test_first_order_recurrences_and_pointer_induction1 but with order
; of phis flipped.
-define ptr @test_first_order_recurrences_and_pointer_induction2(ptr %ptr) {
+define ptr @test_first_order_recurrences_and_pointer_induction2(ptr %ptr, i64 %n) {
; CHECK-LABEL: define ptr @test_first_order_recurrences_and_pointer_induction2(
-; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 4000
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[N_VEC]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP0]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], %[[VECTOR_BODY]] ]
@@ -965,20 +997,20 @@ define ptr @test_first_order_recurrences_and_pointer_induction2(ptr %ptr) {
; CHECK-NEXT: [[VECTOR_GEP]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 4, i64 8, i64 12>
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[VECTOR_RECUR]], <4 x ptr> [[VECTOR_GEP]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds ptr, ptr [[PTR]], i64 [[INDEX1]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i32 0
-; CHECK-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP6]], align 8
-; CHECK-NEXT: store <4 x ptr> [[TMP1]], ptr [[TMP6]], align 8
+; CHECK-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP2]], align 8
+; CHECK-NEXT: store <4 x ptr> [[TMP1]], ptr [[TMP2]], align 8
; CHECK-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 16
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 1000
-; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 2
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[PTR]], %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ [[PTR]], %[[ENTRY]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi ptr [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ null, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
@@ -990,8 +1022,8 @@ define ptr @test_first_order_recurrences_and_pointer_induction2(ptr %ptr) {
; CHECK-NEXT: store ptr [[PTR_IV]], ptr [[TMP3]], align 8
; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr i32, ptr [[PTR_IV]], i64 1
; CHECK-NEXT: store ptr [[FOR_1]], ptr [[TMP3]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP5]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP27:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[FOR_1_LCSSA:%.*]] = phi ptr [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret ptr [[FOR_1_LCSSA]]
@@ -1008,7 +1040,7 @@ loop:
store ptr %ptr.iv, ptr %gep.ptr
%ptr.iv.next = getelementptr i32, ptr %ptr.iv, i64 1
store ptr %for.1, ptr %gep.ptr
- %exitcond.not = icmp eq i64 %iv.next, 1000
+ %exitcond.not = icmp eq i64 %iv.next, %n
br i1 %exitcond.not, label %exit, label %loop
exit:
@@ -1017,12 +1049,15 @@ exit:
; In this test case, %USE_2_FORS uses two different fixed-order recurrences
; and it needs to be sunk past the previous value of both recurrences.
-define double @test_resinking_required(ptr %p, ptr noalias %a, ptr noalias %b) {
+define double @test_resinking_required(ptr %p, ptr noalias %a, ptr noalias %b, i64 %n) {
; CHECK-LABEL: define double @test_resinking_required(
-; CHECK-SAME: ptr [[P:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
+; CHECK-SAME: ptr [[P:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -1043,17 +1078,18 @@ define double @test_resinking_required(ptr %p, ptr noalias %a, ptr noalias %b) {
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[TMP8]], i32 3
; CHECK-NEXT: store double [[TMP6]], ptr [[P]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT9:%.*]] = extractelement <4 x double> [[TMP4]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[TMP4]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[TMP4]], i32 2
-; CHECK-NEXT: br i1 true, label %[[END:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[END:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT5:%.*]] = phi double [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT6:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT9]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT6:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[FOR_1:%.*]] = phi double [ [[L1:%.*]], %[[LOOP]] ], [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ]
@@ -1067,7 +1103,7 @@ define double @test_resinking_required(ptr %p, ptr noalias %a, ptr noalias %b) {
; CHECK-NEXT: [[L2]] = load double, ptr [[B]], align 8
; CHECK-NEXT: [[ADD:%.*]] = fadd double [[DIV]], [[FOR_3]]
; CHECK-NEXT: store double [[ADD]], ptr [[P]], align 8
-; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[COND]], label %[[END]], label %[[LOOP]], !llvm.loop [[LOOP29:![0-9]+]]
; CHECK: [[END]]:
; CHECK-NEXT: [[FOR_1_LCSSA:%.*]] = phi double [ [[FOR_1]], %[[LOOP]] ], [ [[TMP0]], %[[MIDDLE_BLOCK]] ]
@@ -1092,7 +1128,7 @@ Loop:
%l2 = load double, ptr %b, align 8
%add = fadd double %div, %for.3
store double %add, ptr %p, align 8
- %cond = icmp eq i64 %iv.next, 1000
+ %cond = icmp eq i64 %iv.next, %n
br i1 %cond, label %End, label %Loop
End:
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
index ad3c7cb..74df675 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
@@ -25,20 +25,18 @@ define void @can_sink_after_store(i32 %x, ptr %ptr, i64 %tc) local_unnamed_addr
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [257 x i32], ptr @p, i64 0, i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [257 x i32], ptr @q, i64 0, i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1996
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[PREHEADER]] ]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
@@ -54,7 +52,7 @@ define void @can_sink_after_store(i32 %x, ptr %ptr, i64 %tc) local_unnamed_addr
; CHECK-NEXT: store i32 [[ADD_2]], ptr [[IDX_2]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 2000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -104,20 +102,18 @@ define void @sink_sdiv(i32 %x, ptr %ptr, i64 %tc) local_unnamed_addr #0 {
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [257 x i32], ptr @p, i64 0, i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP4:%.*]] = sdiv <4 x i32> [[TMP3]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [257 x i32], ptr @q, i64 0, i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1996
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[PREHEADER]] ]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
@@ -133,7 +129,7 @@ define void @sink_sdiv(i32 %x, ptr %ptr, i64 %tc) local_unnamed_addr #0 {
; CHECK-NEXT: store i32 [[ADD_2]], ptr [[IDX_2]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 2000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -182,22 +178,20 @@ define void @can_sink_with_additional_user(i32 %x, ptr %ptr, i64 %tc) {
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [257 x i32], ptr @p, i64 0, i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP4]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP5]], [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [257 x i32], ptr @q, i64 0, i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP9]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1996
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[PREHEADER]] ]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
@@ -215,7 +209,7 @@ define void @can_sink_with_additional_user(i32 %x, ptr %ptr, i64 %tc) {
; CHECK-NEXT: store i32 [[ADD_4]], ptr [[IDX_2]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 2000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -375,13 +369,12 @@ define void @instruction_with_2_FOR_operands(ptr noalias %A, ptr noalias %B, ptr
; CHECK-NEXT: [[BROADCAST_SPLAT3]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[VECTOR_RECUR]], <4 x float> [[BROADCAST_SPLAT3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <4 x float> [[TMP5]], [[TMP3]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <4 x float> [[TMP6]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store <4 x float> [[TMP6]], ptr [[TMP1]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[BB74:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB:%.*]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi float [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1.000000e+00, [[BB]] ]
@@ -398,7 +391,7 @@ define void @instruction_with_2_FOR_operands(ptr noalias %A, ptr noalias %B, ptr
; CHECK-NEXT: [[TMP60]] = load float, ptr [[B]], align 4
; CHECK-NEXT: store float [[TMP38]], ptr [[GEP]], align 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp slt i64 [[IV]], 1000
-; CHECK-NEXT: br i1 [[TMP12]], label [[BB13]], label [[BB74]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[BB13]], label [[BB74:%.*]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: bb74:
; CHECK-NEXT: ret void
;
@@ -444,19 +437,16 @@ define void @instruction_with_2_FOR_operands_and_multiple_other_uses(ptr noalias
; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <4 x float> [[TMP4]], [[TMP2]]
; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[TMP4]], splat (float 1.000000e+00)
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[DST_1:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <4 x float> [[TMP6]], ptr [[TMP9]], align 4
+; CHECK-NEXT: store <4 x float> [[TMP6]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[DST_2:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0
-; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP11]], align 4
+; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP10]], align 4
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[DST_3:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
-; CHECK-NEXT: store <4 x float> [[TMP7]], ptr [[TMP13]], align 4
+; CHECK-NEXT: store <4 x float> [[TMP7]], ptr [[TMP12]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB:%.*]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi float [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB]] ]
@@ -479,7 +469,7 @@ define void @instruction_with_2_FOR_operands_and_multiple_other_uses(ptr noalias
; CHECK-NEXT: [[GEP_DST_3:%.*]] = getelementptr inbounds float, ptr [[DST_3]], i64 [[IV]]
; CHECK-NEXT: store float [[FOR_1_USE_3]], ptr [[GEP_DST_3]], align 4
; CHECK-NEXT: [[EC:%.*]] = icmp slt i64 [[IV]], 1000
-; CHECK-NEXT: br i1 [[EC]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label [[LOOP]], label [[EXIT:%.*]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -534,19 +524,16 @@ define void @instruction_with_2_FOR_operands_and_multiple_other_uses_chain(ptr n
; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP2]]
; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[TMP4]], splat (float 1.000000e+00)
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[DST_1:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0
-; CHECK-NEXT: store <4 x float> [[TMP7]], ptr [[TMP10]], align 4
+; CHECK-NEXT: store <4 x float> [[TMP7]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[DST_2:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
-; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP12]], align 4
+; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP11]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[DST_3:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 0
-; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[TMP14]], align 4
+; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[TMP13]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB:%.*]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi float [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[BB]] ]
@@ -570,7 +557,7 @@ define void @instruction_with_2_FOR_operands_and_multiple_other_uses_chain(ptr n
; CHECK-NEXT: [[GEP_DST_3:%.*]] = getelementptr inbounds float, ptr [[DST_3]], i64 [[IV]]
; CHECK-NEXT: store float [[FOR_1_USE_3]], ptr [[GEP_DST_3]], align 4
; CHECK-NEXT: [[EC:%.*]] = icmp slt i64 [[IV]], 1000
-; CHECK-NEXT: br i1 [[EC]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label [[LOOP]], label [[EXIT:%.*]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -678,10 +665,9 @@ define i16 @multiple_exit(ptr %p, i32 %n) {
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP6]], align 2
+; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP5]], align 2
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT: store <4 x i16> [[TMP7]], ptr [[TMP6]], align 4
+; CHECK-NEXT: store <4 x i16> [[TMP7]], ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
@@ -753,10 +739,9 @@ define i16 @multiple_exit2(ptr %p, i32 %n) {
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP6]], align 2
+; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP5]], align 2
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT: store <4 x i16> [[TMP7]], ptr [[TMP6]], align 4
+; CHECK-NEXT: store <4 x i16> [[TMP7]], ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
@@ -827,14 +812,13 @@ define void @sink_dominance(ptr %ptr, i32 %N) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
; CHECK-NEXT: [[TMP5]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP5]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP7:%.*]] = trunc <4 x i64> [[TMP6]] to <4 x i32>
; CHECK-NEXT: [[TMP8:%.*]] = icmp slt <4 x i32> [[TMP7]], splat (i32 213)
; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP7]], <4 x i32> splat (i32 22)
-; CHECK-NEXT: store <4 x i32> [[TMP9]], ptr [[TMP4]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP9]], ptr [[TMP3]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
@@ -907,8 +891,7 @@ define void @sink_dominance_2(ptr %ptr, i32 %N) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
; CHECK-NEXT: [[TMP5]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP5]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP7:%.*]] = trunc <4 x i64> [[TMP6]] to <4 x i32>
@@ -916,7 +899,7 @@ define void @sink_dominance_2(ptr %ptr, i32 %N) {
; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP8]], splat (i32 99)
; CHECK-NEXT: [[TMP10:%.*]] = icmp slt <4 x i32> [[TMP7]], splat (i32 213)
; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP7]], <4 x i32> [[TMP9]]
-; CHECK-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP4]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP3]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
@@ -1031,20 +1014,19 @@ define void @test_for_sink_instruction_after_same_incoming_1(ptr %ptr) {
; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 2.000000e+01>, [[VECTOR_PH]] ], [ [[WIDE_LOAD]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[PTR:%.*]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x double>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x double>, ptr [[TMP1]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR1]], <4 x double> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x double> splat (double 1.000000e+01), [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x double> [[TMP4]], [[TMP5]]
-; CHECK-NEXT: store <4 x double> [[TMP6]], ptr [[TMP2]], align 8
+; CHECK-NEXT: store <4 x double> [[TMP6]], ptr [[TMP1]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1.000000e+01, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT3:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ 2.000000e+01, [[ENTRY]] ]
@@ -1061,7 +1043,7 @@ define void @test_for_sink_instruction_after_same_incoming_1(ptr %ptr) {
; CHECK-NEXT: [[FOR_1_NEXT]] = load double, ptr [[GEP_PTR]], align 8
; CHECK-NEXT: store double [[ADD_2]], ptr [[GEP_PTR]], align 8
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -1098,20 +1080,19 @@ define void @test_for_sink_instruction_after_same_incoming_2(ptr %ptr) {
; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 1.000000e+01>, [[VECTOR_PH]] ], [ [[WIDE_LOAD]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[PTR:%.*]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x double>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x double>, ptr [[TMP1]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR1]], <4 x double> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x double> splat (double 1.000000e+01), [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x double> [[TMP5]], [[TMP3]]
-; CHECK-NEXT: store <4 x double> [[TMP6]], ptr [[TMP2]], align 8
+; CHECK-NEXT: store <4 x double> [[TMP6]], ptr [[TMP1]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 2.000000e+01, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT3:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ 1.000000e+01, [[ENTRY]] ]
@@ -1128,7 +1109,7 @@ define void @test_for_sink_instruction_after_same_incoming_2(ptr %ptr) {
; CHECK-NEXT: [[FOR_1_NEXT]] = load double, ptr [[GEP_PTR]], align 8
; CHECK-NEXT: store double [[ADD_2]], ptr [[GEP_PTR]], align 8
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP25:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll
index 86171e6..71c2da2 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll
@@ -6,42 +6,18 @@ define i8 @recurrence_phi_with_same_incoming_values_after_simplifications(i8 %fo
; CHECK-LABEL: define i8 @recurrence_phi_with_same_incoming_values_after_simplifications(
; CHECK-SAME: i8 [[FOR_START:%.*]], ptr [[DST:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
-; CHECK: [[VECTOR_SCEVCHECK]]:
-; CHECK-NEXT: br i1 true, label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
-; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[FOR_START]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLAT]], <4 x i8> [[BROADCAST_SPLAT]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4
-; CHECK-NEXT: store <4 x i8> [[TMP0]], ptr [[TMP2]], align 1
-; CHECK-NEXT: store <4 x i8> [[TMP0]], ptr [[TMP3]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], -8
-; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ -7, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ], [ 1, %[[VECTOR_SCEVCHECK]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[FOR_START]], %[[MIDDLE_BLOCK]] ], [ [[FOR_START]], %[[ENTRY]] ], [ [[FOR_START]], %[[VECTOR_SCEVCHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[FOR:%.*]] = phi i8 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[FOR:%.*]] = phi i8 [ [[FOR_START]], %[[ENTRY]] ], [ [[FOR_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[FOR_NEXT]] = and i8 [[FOR_START]], -1
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[IV]]
; CHECK-NEXT: store i8 [[FOR]], ptr [[GEP_DST]], align 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[FOR_NEXT_LCSSA:%.*]] = phi i8 [ [[FOR_NEXT]], %[[LOOP]] ], [ [[FOR_START]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[FOR_NEXT_LCSSA:%.*]] = phi i8 [ [[FOR_NEXT]], %[[LOOP]] ]
; CHECK-NEXT: ret i8 [[FOR_NEXT_LCSSA]]
;
entry:
@@ -79,21 +55,19 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
; CHECK-NEXT: [[TMP1:%.*]] = or <4 x i16> [[TMP0]], [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP3]], i32 4
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP4]], align 4
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP3]], align 4
; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
-; CHECK-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[FOR_END:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -108,7 +82,7 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[B3]] to i32
; CHECK-NEXT: [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]]
; CHECK-NEXT: store i32 0, ptr [[A_GEP]], align 4
-; CHECK-NEXT: br i1 [[VEC_DEAD]], label %[[FOR_END]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[VEC_DEAD]], label %[[FOR_END]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[FOR_END]]:
; CHECK-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[FOR_LCSSA]]
@@ -162,18 +136,17 @@ define void @sink_dead_inst(ptr %a) {
; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i16> [[TMP5]], splat (i16 10)
; CHECK-NEXT: [[TMP8:%.*]] = sub <4 x i16> [[TMP6]], splat (i16 10)
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[A]], i16 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[TMP9]], i32 0
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[TMP9]], i32 4
-; CHECK-NEXT: store <4 x i16> [[TMP7]], ptr [[TMP10]], align 2
+; CHECK-NEXT: store <4 x i16> [[TMP7]], ptr [[TMP9]], align 2
; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr [[TMP11]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
-; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-; CHECK-NEXT: br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 13, %[[MIDDLE_BLOCK]] ], [ -27, %[[ENTRY]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
@@ -190,7 +163,7 @@ define void @sink_dead_inst(ptr %a) {
; CHECK-NEXT: [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[A]], i16 [[IV]]
; CHECK-NEXT: store i16 [[USE_REC_1]], ptr [[GEP]], align 2
-; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_END:.*]], label %[[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[FOR_END]]:
; CHECK-NEXT: ret void
;
@@ -232,10 +205,10 @@ define void @unused_recurrence(ptr %a) {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
-; CHECK-NEXT: br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 997, %[[MIDDLE_BLOCK]] ], [ -27, %[[ENTRY]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
@@ -247,7 +220,7 @@ define void @unused_recurrence(ptr %a) {
; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1
; CHECK-NEXT: [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[IV]], 1000
-; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_END:.*]], label %[[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[FOR_END]]:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-interleave-only.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-interleave-only.ll
index 53113b2..d95c487 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-interleave-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-interleave-only.ll
@@ -7,7 +7,6 @@ define float @for_load_interleave_only(ptr %src) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[SRC]], i64 16000
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -16,18 +15,17 @@ define float @for_load_interleave_only(ptr %src) {
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[NEXT_GEP]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[NEXT_GEP2]], align 4
; CHECK-NEXT: store float 0.000000e+00, ptr [[NEXT_GEP]], align 4
; CHECK-NEXT: store float 0.000000e+00, ptr [[NEXT_GEP2]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1001, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[SRC]], %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll
index 05eaae5..715ea1c 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll
@@ -114,17 +114,16 @@ define void @test_pr54223_sink_after_insertion_order(ptr noalias %a, ptr noalias
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[VECTOR_RECUR1]], <4 x float> [[BROADCAST_SPLAT3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP6:%.*]] = fneg <4 x float> [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP3]], <4 x float> [[TMP6]], <4 x float> zeroinitializer)
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <4 x float> [[TMP7]], ptr [[TMP8]], align 4
+; CHECK-NEXT: store <4 x float> [[TMP7]], ptr [[TMP1]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT5:%.*]] = phi float [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT5:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -235,14 +234,13 @@ define void @test_pr54233_for_depend_on_each_other(ptr noalias %a, ptr noalias %
; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[TMP6]], splat (i32 255)
; CHECK-NEXT: [[TMP8:%.*]] = and <4 x i32> [[TMP7]], [[TMP3]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP10]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
@@ -262,7 +260,7 @@ define void @test_pr54233_for_depend_on_each_other(ptr noalias %a, ptr noalias %
; CHECK-NEXT: store i32 [[AND]], ptr [[A_GEP]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -398,13 +396,12 @@ define void @hoist_previous_value_and_operand(ptr %dst, i64 %mask) {
; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = and <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP4]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP6]] = or <4 x i32> [[TMP5]], splat (i32 3)
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR1]], <4 x i32> [[TMP6]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP2]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP1]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 336
@@ -412,7 +409,7 @@ define void @hoist_previous_value_and_operand(ptr %dst, i64 %mask) {
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 337, [[MIDDLE_BLOCK]] ], [ 1, [[BB:%.*]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1, [[BB]] ]
@@ -429,7 +426,7 @@ define void @hoist_previous_value_and_operand(ptr %dst, i64 %mask) {
; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i64 [[IV]], 337
; CHECK-NEXT: [[A:%.*]] = and i64 [[IV]], [[MASK]]
; CHECK-NEXT: [[TRUNC]] = trunc i64 [[A]] to i32
-; CHECK-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll
index f4044a7..2bafa6c 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll
@@ -24,8 +24,7 @@ define i64 @pr97452_scalable_vf1_for_live_out(ptr %src) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 1 x i64> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD]] = load <vscale x 1 x i64>, ptr [[TMP6]], align 8
+; CHECK-NEXT: [[WIDE_LOAD]] = load <vscale x 1 x i64>, ptr [[TMP5]], align 8
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> [[VECTOR_RECUR]], <vscale x 1 x i64> [[WIDE_LOAD]], i32 -1)
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -93,12 +92,10 @@ define void @pr97452_scalable_vf1_for_no_live_out(ptr %src, ptr noalias %dst) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 1 x i64> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD]] = load <vscale x 1 x i64>, ptr [[TMP6]], align 8
+; CHECK-NEXT: [[WIDE_LOAD]] = load <vscale x 1 x i64>, ptr [[TMP5]], align 8
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> [[VECTOR_RECUR]], <vscale x 1 x i64> [[WIDE_LOAD]], i32 -1)
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <vscale x 1 x i64> [[TMP7]], ptr [[TMP9]], align 8
+; CHECK-NEXT: store <vscale x 1 x i64> [[TMP7]], ptr [[TMP8]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index ff5c29f..3adfcf5 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -33,18 +33,16 @@ define void @recurrence_1(ptr readonly noalias %a, ptr noalias %b, i32 %n) {
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_LOAD1:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[INDEX]], 1
; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
-; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 4
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
; UNROLL-NO-IC-NEXT: [[WIDE_LOAD1]] = load <4 x i32>, ptr [[TMP7]], align 4
; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> [[WIDE_LOAD1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[TMP8]]
; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = add <4 x i32> [[WIDE_LOAD1]], [[TMP9]]
-; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 4
-; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP13]], align 4
+; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP10]], align 4
; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP12]], ptr [[TMP14]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -147,13 +145,11 @@ define void @recurrence_1(ptr readonly noalias %a, ptr noalias %b, i32 %n) {
; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], [[VECTOR_BODY]] ]
; SINK-AFTER-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[INDEX]], 1
; SINK-AFTER-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
-; SINK-AFTER-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i32>, ptr [[TMP6]], align 4
+; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i32>, ptr [[TMP5]], align 4
; SINK-AFTER-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
; SINK-AFTER-NEXT: [[TMP9:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[TMP7]]
-; SINK-AFTER-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; SINK-AFTER-NEXT: store <4 x i32> [[TMP9]], ptr [[TMP10]], align 4
+; SINK-AFTER-NEXT: store <4 x i32> [[TMP9]], ptr [[TMP8]], align 4
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; SINK-AFTER-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -235,9 +231,8 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ poison, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ poison, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 4
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
; UNROLL-NO-IC-NEXT: [[WIDE_LOAD2]] = load <4 x i32>, ptr [[TMP4]], align 4
; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> [[WIDE_LOAD2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -375,8 +370,7 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], [[VECTOR_BODY]] ]
; SINK-AFTER-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ poison, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; SINK-AFTER-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; SINK-AFTER-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i32>, ptr [[TMP3]], align 4
+; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i32>, ptr [[TMP2]], align 4
; SINK-AFTER-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[TMP5:%.*]] = sub nsw <4 x i32> [[WIDE_LOAD]], [[TMP4]]
; SINK-AFTER-NEXT: [[TMP6:%.*]] = icmp sgt <4 x i32> [[TMP5]], zeroinitializer
@@ -491,9 +485,8 @@ define void @recurrence_3(ptr readonly noalias %a, ptr noalias %b, i32 %n, float
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_LOAD1:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[OFFSET_IDX]]
-; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 4
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP5]], align 2
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP4]], align 2
; UNROLL-NO-IC-NEXT: [[WIDE_LOAD1]] = load <4 x i16>, ptr [[TMP6]], align 2
; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -506,9 +499,8 @@ define void @recurrence_3(ptr readonly noalias %a, ptr noalias %b, i32 %n, float
; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = fsub fast <4 x double> [[TMP9]], [[TMP13]]
; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = fsub fast <4 x double> [[TMP10]], [[TMP14]]
; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[OFFSET_IDX]]
-; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, ptr [[TMP17]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[TMP17]], i32 4
-; UNROLL-NO-IC-NEXT: store <4 x double> [[TMP15]], ptr [[TMP18]], align 8
+; UNROLL-NO-IC-NEXT: store <4 x double> [[TMP15]], ptr [[TMP17]], align 8
; UNROLL-NO-IC-NEXT: store <4 x double> [[TMP16]], ptr [[TMP19]], align 8
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -642,16 +634,14 @@ define void @recurrence_3(ptr readonly noalias %a, ptr noalias %b, i32 %n, float
; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], [[VECTOR_BODY]] ]
; SINK-AFTER-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
; SINK-AFTER-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[OFFSET_IDX]]
-; SINK-AFTER-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0
-; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP5]], align 2
+; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP4]], align 2
; SINK-AFTER-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[TMP7:%.*]] = sitofp <4 x i16> [[WIDE_LOAD]] to <4 x double>
; SINK-AFTER-NEXT: [[TMP8:%.*]] = sitofp <4 x i16> [[TMP6]] to <4 x double>
; SINK-AFTER-NEXT: [[TMP9:%.*]] = fmul fast <4 x double> [[TMP8]], [[BROADCAST_SPLAT]]
; SINK-AFTER-NEXT: [[TMP10:%.*]] = fsub fast <4 x double> [[TMP7]], [[TMP9]]
; SINK-AFTER-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[OFFSET_IDX]]
-; SINK-AFTER-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[TMP11]], i32 0
-; SINK-AFTER-NEXT: store <4 x double> [[TMP10]], ptr [[TMP12]], align 8
+; SINK-AFTER-NEXT: store <4 x double> [[TMP10]], ptr [[TMP11]], align 8
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; SINK-AFTER-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -1203,10 +1193,10 @@ define i64 @constant_folded_previous_value() {
; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; UNROLL-NO-IC-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-IC-NEXT: br label [[FOR_END:%.*]]
; UNROLL-NO-IC: scalar.ph:
-; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 1, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY]] ]
; UNROLL-NO-IC-NEXT: br label [[SCALAR_BODY:%.*]]
; UNROLL-NO-IC: scalar.body:
; UNROLL-NO-IC-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[SCALAR_BODY]] ]
@@ -1230,10 +1220,10 @@ define i64 @constant_folded_previous_value() {
; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; UNROLL-NO-VF-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; UNROLL-NO-VF: middle.block:
-; UNROLL-NO-VF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-VF-NEXT: br label [[FOR_END:%.*]]
; UNROLL-NO-VF: scalar.ph:
-; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 1, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY]] ]
; UNROLL-NO-VF-NEXT: br label [[SCALAR_BODY:%.*]]
; UNROLL-NO-VF: scalar.body:
; UNROLL-NO-VF-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[SCALAR_BODY]] ]
@@ -1257,10 +1247,10 @@ define i64 @constant_folded_previous_value() {
; SINK-AFTER-NEXT: [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; SINK-AFTER-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; SINK-AFTER: middle.block:
-; SINK-AFTER-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; SINK-AFTER-NEXT: br label [[FOR_END:%.*]]
; SINK-AFTER: scalar.ph:
-; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 1, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY]] ]
; SINK-AFTER-NEXT: br label [[SCALAR_BODY:%.*]]
; SINK-AFTER: scalar.body:
; SINK-AFTER-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[SCALAR_BODY]] ]
@@ -1294,11 +1284,15 @@ for.end:
; the first order recurrence phi is used outside the loop, so we require the phi
; itself and not its update (addx).
; Check the case when unrolled but not vectorized.
-define i32 @extract_second_last_iteration(ptr %cval, i32 %x) {
+define i32 @extract_second_last_iteration(ptr %cval, i32 %x, i32 %n) {
; UNROLL-NO-IC-LABEL: @extract_second_last_iteration(
; UNROLL-NO-IC-NEXT: entry:
-; UNROLL-NO-IC-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add i32 [[N:%.*]], 1
+; UNROLL-NO-IC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 8
+; UNROLL-NO-IC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; UNROLL-NO-IC: vector.ph:
+; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 8
+; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]]
; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0
; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -1309,14 +1303,15 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) {
; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add <4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT]]
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
-; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
-; UNROLL-NO-IC-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; UNROLL-NO-IC: middle.block:
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
-; UNROLL-NO-IC-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
-; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; UNROLL-NO-IC-NEXT: br label [[FOR_BODY:%.*]]
; UNROLL-NO-IC: for.body:
@@ -1325,7 +1320,7 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) {
; UNROLL-NO-IC-NEXT: [[INC]] = add i32 [[INC_PHI]], 1
; UNROLL-NO-IC-NEXT: [[BC:%.*]] = zext i32 [[INC_PHI]] to i64
; UNROLL-NO-IC-NEXT: [[ADDX]] = add i32 [[INC_PHI]], [[X]]
-; UNROLL-NO-IC-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC_PHI]], 95
+; UNROLL-NO-IC-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC_PHI]], [[N]]
; UNROLL-NO-IC-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; UNROLL-NO-IC: for.end:
; UNROLL-NO-IC-NEXT: [[VAL_PHI_LCSSA:%.*]] = phi i32 [ [[VAL_PHI]], [[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
@@ -1333,21 +1328,26 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) {
;
; UNROLL-NO-VF-LABEL: @extract_second_last_iteration(
; UNROLL-NO-VF-NEXT: entry:
-; UNROLL-NO-VF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i32 [[N:%.*]], 1
+; UNROLL-NO-VF-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 2
+; UNROLL-NO-VF-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; UNROLL-NO-VF: vector.ph:
+; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 2
+; UNROLL-NO-VF-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]]
; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL-NO-VF: vector.body:
; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i32 [[VECTOR_RECUR]], 1
+; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = add i32 [[VECTOR_RECUR]], 1
; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i32 [[VECTOR_RECUR]], [[X:%.*]]
-; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], [[X]]
+; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = add i32 [[TMP6]], [[X]]
; UNROLL-NO-VF-NEXT: [[TMP3]] = add nuw i32 [[VECTOR_RECUR]], 2
-; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP3]], 96
+; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP3]], [[N_VEC]]
; UNROLL-NO-VF-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; UNROLL-NO-VF: middle.block:
-; UNROLL-NO-VF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
+; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-VF: scalar.ph:
-; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; UNROLL-NO-VF-NEXT: br label [[FOR_BODY:%.*]]
; UNROLL-NO-VF: for.body:
@@ -1356,7 +1356,7 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) {
; UNROLL-NO-VF-NEXT: [[INC]] = add i32 [[INC_PHI]], 1
; UNROLL-NO-VF-NEXT: [[BC:%.*]] = zext i32 [[INC_PHI]] to i64
; UNROLL-NO-VF-NEXT: [[ADDX]] = add i32 [[INC_PHI]], [[X]]
-; UNROLL-NO-VF-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC_PHI]], 95
+; UNROLL-NO-VF-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC_PHI]], [[N]]
; UNROLL-NO-VF-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; UNROLL-NO-VF: for.end:
; UNROLL-NO-VF-NEXT: [[VAL_PHI_LCSSA:%.*]] = phi i32 [ [[VAL_PHI]], [[FOR_BODY]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ]
@@ -1364,8 +1364,12 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) {
;
; SINK-AFTER-LABEL: @extract_second_last_iteration(
; SINK-AFTER-NEXT: entry:
-; SINK-AFTER-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SINK-AFTER-NEXT: [[TMP1:%.*]] = add i32 [[N:%.*]], 1
+; SINK-AFTER-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 4
+; SINK-AFTER-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; SINK-AFTER: vector.ph:
+; SINK-AFTER-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 4
+; SINK-AFTER-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]]
; SINK-AFTER-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0
; SINK-AFTER-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; SINK-AFTER-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -1375,14 +1379,15 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) {
; SINK-AFTER-NEXT: [[TMP0:%.*]] = add <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
-; SINK-AFTER-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
-; SINK-AFTER-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; SINK-AFTER-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; SINK-AFTER-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; SINK-AFTER: middle.block:
; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
-; SINK-AFTER-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
+; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; SINK-AFTER: scalar.ph:
-; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; SINK-AFTER-NEXT: br label [[FOR_BODY:%.*]]
; SINK-AFTER: for.body:
@@ -1391,7 +1396,7 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) {
; SINK-AFTER-NEXT: [[INC]] = add i32 [[INC_PHI]], 1
; SINK-AFTER-NEXT: [[BC:%.*]] = zext i32 [[INC_PHI]] to i64
; SINK-AFTER-NEXT: [[ADDX]] = add i32 [[INC_PHI]], [[X]]
-; SINK-AFTER-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC_PHI]], 95
+; SINK-AFTER-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC_PHI]], [[N]]
; SINK-AFTER-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; SINK-AFTER: for.end:
; SINK-AFTER-NEXT: [[VAL_PHI_LCSSA:%.*]] = phi i32 [ [[VAL_PHI]], [[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
@@ -1406,7 +1411,7 @@ for.body:
%inc = add i32 %inc.phi, 1
%bc = zext i32 %inc.phi to i64
%addx = add i32 %inc.phi, %x
- %cmp = icmp eq i32 %inc.phi, 95
+ %cmp = icmp eq i32 %inc.phi, %n
br i1 %cmp, label %for.end, label %for.body
for.end:
@@ -1431,13 +1436,21 @@ for.end:
; }
;
;
-define i32 @PR33613(ptr %b, double %j, i32 %d) {
+define i32 @PR33613(ptr %b, double %j, i32 %d, i32 %n) {
; UNROLL-NO-IC-LABEL: @PR33613(
; UNROLL-NO-IC-NEXT: entry:
; UNROLL-NO-IC-NEXT: [[IDXPROM:%.*]] = sext i32 [[D:%.*]] to i64
-; UNROLL-NO-IC-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; UNROLL-NO-IC-NEXT: [[TMP42:%.*]] = add i32 [[N:%.*]], -1
+; UNROLL-NO-IC-NEXT: [[TMP45:%.*]] = zext i32 [[TMP42]] to i64
+; UNROLL-NO-IC-NEXT: [[TMP46:%.*]] = add nuw nsw i64 [[TMP45]], 1
+; UNROLL-NO-IC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP46]], 8
+; UNROLL-NO-IC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; UNROLL-NO-IC: vector.ph:
-; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 2048000
+; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP46]], 8
+; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP46]], [[N_MOD_VF]]
+; UNROLL-NO-IC-NEXT: [[TMP47:%.*]] = mul i64 [[N_VEC]], 200
+; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP47]]
+; UNROLL-NO-IC-NEXT: [[TMP49:%.*]] = trunc i64 [[N_VEC]] to i32
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x double> poison, double [[J:%.*]], i32 3
; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL-NO-IC: vector.body:
@@ -1497,15 +1510,16 @@ define i32 @PR33613(ptr %b, double %j, i32 %d) {
; UNROLL-NO-IC-NEXT: [[TMP40]] = add <4 x i32> [[VEC_PHI]], [[TMP38]]
; UNROLL-NO-IC-NEXT: [[TMP41]] = add <4 x i32> [[VEC_PHI2]], [[TMP39]]
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; UNROLL-NO-IC-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10240
-; UNROLL-NO-IC-NEXT: br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; UNROLL-NO-IC-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; UNROLL-NO-IC: middle.block:
; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP41]], [[TMP40]]
; UNROLL-NO-IC-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; UNROLL-NO-IC-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP46]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 10240, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[TMP49]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP43]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[TMP27]], [[MIDDLE_BLOCK]] ], [ [[J]], [[ENTRY]] ]
; UNROLL-NO-IC-NEXT: br label [[FOR_BODY:%.*]]
@@ -1525,15 +1539,23 @@ define i32 @PR33613(ptr %b, double %j, i32 %d) {
; UNROLL-NO-IC-NEXT: [[A_1]] = add nsw i32 [[A_010]], [[INC]]
; UNROLL-NO-IC-NEXT: [[INC1]] = add nuw nsw i32 [[I_011]], 1
; UNROLL-NO-IC-NEXT: [[ADD_PTR]] = getelementptr inbounds double, ptr [[B_ADDR_012]], i64 25
-; UNROLL-NO-IC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC1]], 10240
+; UNROLL-NO-IC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC1]], [[N]]
; UNROLL-NO-IC-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
;
; UNROLL-NO-VF-LABEL: @PR33613(
; UNROLL-NO-VF-NEXT: entry:
; UNROLL-NO-VF-NEXT: [[IDXPROM:%.*]] = sext i32 [[D:%.*]] to i64
-; UNROLL-NO-VF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i32 [[N:%.*]], -1
+; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = zext i32 [[TMP0]] to i64
+; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = add nuw nsw i64 [[TMP14]], 1
+; UNROLL-NO-VF-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP16]], 2
+; UNROLL-NO-VF-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; UNROLL-NO-VF: vector.ph:
-; UNROLL-NO-VF-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 2048000
+; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP16]], 2
+; UNROLL-NO-VF-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP16]], [[N_MOD_VF]]
+; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = mul i64 [[N_VEC]], 200
+; UNROLL-NO-VF-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP17]]
+; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = trunc i64 [[N_VEC]] to i32
; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL-NO-VF: vector.body:
; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1557,14 +1579,15 @@ define i32 @PR33613(ptr %b, double %j, i32 %d) {
; UNROLL-NO-VF-NEXT: [[TMP12]] = add i32 [[VEC_PHI]], [[TMP10]]
; UNROLL-NO-VF-NEXT: [[TMP13]] = add i32 [[VEC_PHI2]], [[TMP11]]
; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10240
-; UNROLL-NO-VF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; UNROLL-NO-VF-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-VF-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; UNROLL-NO-VF: middle.block:
; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP13]], [[TMP12]]
-; UNROLL-NO-VF-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP16]], [[N_VEC]]
+; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-VF: scalar.ph:
; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 10240, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; UNROLL-NO-VF-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ [[J]], [[ENTRY]] ]
; UNROLL-NO-VF-NEXT: br label [[FOR_BODY:%.*]]
@@ -1584,15 +1607,23 @@ define i32 @PR33613(ptr %b, double %j, i32 %d) {
; UNROLL-NO-VF-NEXT: [[A_1]] = add nsw i32 [[A_010]], [[INC]]
; UNROLL-NO-VF-NEXT: [[INC1]] = add nuw nsw i32 [[I_011]], 1
; UNROLL-NO-VF-NEXT: [[ADD_PTR]] = getelementptr inbounds double, ptr [[B_ADDR_012]], i64 25
-; UNROLL-NO-VF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC1]], 10240
+; UNROLL-NO-VF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC1]], [[N]]
; UNROLL-NO-VF-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
;
; SINK-AFTER-LABEL: @PR33613(
; SINK-AFTER-NEXT: entry:
; SINK-AFTER-NEXT: [[IDXPROM:%.*]] = sext i32 [[D:%.*]] to i64
-; SINK-AFTER-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SINK-AFTER-NEXT: [[TMP21:%.*]] = add i32 [[N:%.*]], -1
+; SINK-AFTER-NEXT: [[TMP24:%.*]] = zext i32 [[TMP21]] to i64
+; SINK-AFTER-NEXT: [[TMP25:%.*]] = add nuw nsw i64 [[TMP24]], 1
+; SINK-AFTER-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP25]], 4
+; SINK-AFTER-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; SINK-AFTER: vector.ph:
-; SINK-AFTER-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 2048000
+; SINK-AFTER-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP25]], 4
+; SINK-AFTER-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP25]], [[N_MOD_VF]]
+; SINK-AFTER-NEXT: [[TMP26:%.*]] = mul i64 [[N_VEC]], 200
+; SINK-AFTER-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP26]]
+; SINK-AFTER-NEXT: [[TMP28:%.*]] = trunc i64 [[N_VEC]] to i32
; SINK-AFTER-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x double> poison, double [[J:%.*]], i32 3
; SINK-AFTER-NEXT: br label [[VECTOR_BODY:%.*]]
; SINK-AFTER: vector.body:
@@ -1626,14 +1657,15 @@ define i32 @PR33613(ptr %b, double %j, i32 %d) {
; SINK-AFTER-NEXT: [[TMP19:%.*]] = zext <4 x i1> [[TMP18]] to <4 x i32>
; SINK-AFTER-NEXT: [[TMP20]] = add <4 x i32> [[VEC_PHI]], [[TMP19]]
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; SINK-AFTER-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10240
-; SINK-AFTER-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; SINK-AFTER-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SINK-AFTER-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; SINK-AFTER: middle.block:
; SINK-AFTER-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP20]])
-; SINK-AFTER-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP25]], [[N_VEC]]
+; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; SINK-AFTER: scalar.ph:
; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY:%.*]] ]
-; SINK-AFTER-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 10240, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; SINK-AFTER-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[TMP28]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; SINK-AFTER-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ [[J]], [[ENTRY]] ]
; SINK-AFTER-NEXT: br label [[FOR_BODY:%.*]]
@@ -1653,7 +1685,7 @@ define i32 @PR33613(ptr %b, double %j, i32 %d) {
; SINK-AFTER-NEXT: [[A_1]] = add nsw i32 [[A_010]], [[INC]]
; SINK-AFTER-NEXT: [[INC1]] = add nuw nsw i32 [[I_011]], 1
; SINK-AFTER-NEXT: [[ADD_PTR]] = getelementptr inbounds double, ptr [[B_ADDR_012]], i64 25
-; SINK-AFTER-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC1]], 10240
+; SINK-AFTER-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC1]], [[N]]
; SINK-AFTER-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
;
entry:
@@ -1677,7 +1709,7 @@ for.body:
%a.1 = add nsw i32 %a.010, %inc
%inc1 = add nuw nsw i32 %i.011, 1
%add.ptr = getelementptr inbounds double, ptr %b.addr.012, i64 25
- %exitcond = icmp eq i32 %inc1, 10240
+ %exitcond = icmp eq i32 %inc1, %n
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
@@ -1704,9 +1736,8 @@ define void @sink_after(ptr noalias %a, ptr noalias %b, i64 %n) {
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_LOAD1:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[INDEX]], 1
; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP1]]
-; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[TMP2]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[TMP2]], i32 4
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP3]], align 2
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP2]], align 2
; UNROLL-NO-IC-NEXT: [[WIDE_LOAD1]] = load <4 x i16>, ptr [[TMP4]], align 2
; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -1717,9 +1748,8 @@ define void @sink_after(ptr noalias %a, ptr noalias %b, i64 %n) {
; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[TMP9]], [[TMP7]]
; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = mul nsw <4 x i32> [[TMP10]], [[TMP8]]
; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
-; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 4
-; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP14]], align 4
+; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP13]], align 4
; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP12]], ptr [[TMP15]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1818,15 +1848,13 @@ define void @sink_after(ptr noalias %a, ptr noalias %b, i64 %n) {
; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], [[VECTOR_BODY]] ]
; SINK-AFTER-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[INDEX]], 1
; SINK-AFTER-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP1]]
-; SINK-AFTER-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[TMP2]], i32 0
-; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP3]], align 2
+; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2
; SINK-AFTER-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[TMP5:%.*]] = sext <4 x i16> [[TMP4]] to <4 x i32>
; SINK-AFTER-NEXT: [[TMP6:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
; SINK-AFTER-NEXT: [[TMP7:%.*]] = mul nsw <4 x i32> [[TMP6]], [[TMP5]]
; SINK-AFTER-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
-; SINK-AFTER-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; SINK-AFTER-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP9]], align 4
+; SINK-AFTER-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP8]], align 4
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; SINK-AFTER-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
@@ -1923,9 +1951,8 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr inbounds [2 x i16], ptr [[A]], i64 [[TMP5]], i64 1
; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i16], ptr [[A]], i64 [[TMP6]], i64 1
; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i16], ptr [[A]], i64 [[TMP7]], i64 1
-; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 4
-; UNROLL-NO-IC-NEXT: store <4 x i32> splat (i32 7), ptr [[TMP17]], align 4
+; UNROLL-NO-IC-NEXT: store <4 x i32> splat (i32 7), ptr [[TMP8]], align 4
; UNROLL-NO-IC-NEXT: store <4 x i32> splat (i32 7), ptr [[TMP18]], align 4
; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = load i16, ptr [[TMP9]], align 2
; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = load i16, ptr [[TMP10]], align 2
@@ -1952,9 +1979,8 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
; UNROLL-NO-IC-NEXT: [[TMP41:%.*]] = mul nsw <4 x i32> [[TMP39]], [[TMP37]]
; UNROLL-NO-IC-NEXT: [[TMP42:%.*]] = mul nsw <4 x i32> [[TMP40]], [[TMP38]]
; UNROLL-NO-IC-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
-; UNROLL-NO-IC-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP43]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP43]], i32 4
-; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP44]], align 4
+; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP43]], align 4
; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP42]], ptr [[TMP45]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -2065,8 +2091,7 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
; SINK-AFTER-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i16], ptr [[A]], i64 [[TMP1]], i64 1
; SINK-AFTER-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i16], ptr [[A]], i64 [[TMP2]], i64 1
; SINK-AFTER-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i16], ptr [[A]], i64 [[TMP3]], i64 1
-; SINK-AFTER-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; SINK-AFTER-NEXT: store <4 x i32> splat (i32 7), ptr [[TMP9]], align 4
+; SINK-AFTER-NEXT: store <4 x i32> splat (i32 7), ptr [[TMP4]], align 4
; SINK-AFTER-NEXT: [[TMP10:%.*]] = load i16, ptr [[TMP5]], align 2
; SINK-AFTER-NEXT: [[TMP11:%.*]] = load i16, ptr [[TMP6]], align 2
; SINK-AFTER-NEXT: [[TMP12:%.*]] = load i16, ptr [[TMP7]], align 2
@@ -2080,8 +2105,7 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
; SINK-AFTER-NEXT: [[TMP20:%.*]] = sext <4 x i16> [[TMP17]] to <4 x i32>
; SINK-AFTER-NEXT: [[TMP21:%.*]] = mul nsw <4 x i32> [[TMP20]], [[TMP19]]
; SINK-AFTER-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
-; SINK-AFTER-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 0
-; SINK-AFTER-NEXT: store <4 x i32> [[TMP21]], ptr [[TMP23]], align 4
+; SINK-AFTER-NEXT: store <4 x i32> [[TMP21]], ptr [[TMP22]], align 4
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; SINK-AFTER-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
@@ -2158,9 +2182,8 @@ define void @sink_after_with_multiple_users(ptr noalias %a, ptr noalias %b, i64
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_LOAD1:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[INDEX]], 1
; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP1]]
-; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[TMP2]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[TMP2]], i32 4
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP3]], align 2
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP2]], align 2
; UNROLL-NO-IC-NEXT: [[WIDE_LOAD1]] = load <4 x i16>, ptr [[TMP4]], align 2
; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -2173,9 +2196,8 @@ define void @sink_after_with_multiple_users(ptr noalias %a, ptr noalias %b, i64
; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = mul nsw <4 x i32> [[TMP9]], [[TMP11]]
; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = mul nsw <4 x i32> [[TMP10]], [[TMP12]]
; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
-; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 4
-; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP13]], ptr [[TMP16]], align 4
+; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP13]], ptr [[TMP15]], align 4
; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP17]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -2278,16 +2300,14 @@ define void @sink_after_with_multiple_users(ptr noalias %a, ptr noalias %b, i64
; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], [[VECTOR_BODY]] ]
; SINK-AFTER-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[INDEX]], 1
; SINK-AFTER-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP1]]
-; SINK-AFTER-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[TMP2]], i32 0
-; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP3]], align 2
+; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2
; SINK-AFTER-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[TMP5:%.*]] = sext <4 x i16> [[TMP4]] to <4 x i32>
; SINK-AFTER-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], splat (i32 2)
; SINK-AFTER-NEXT: [[TMP7:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
; SINK-AFTER-NEXT: [[TMP8:%.*]] = mul nsw <4 x i32> [[TMP6]], [[TMP7]]
; SINK-AFTER-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
-; SINK-AFTER-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; SINK-AFTER-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP10]], align 4
+; SINK-AFTER-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; SINK-AFTER-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
@@ -2465,9 +2485,8 @@ define void @sink_dead_inst(ptr %a) {
; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = sub <4 x i16> [[TMP6]], splat (i16 10)
; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = sub <4 x i16> [[TMP7]], splat (i16 10)
; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[A:%.*]], i16 [[OFFSET_IDX]]
-; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[TMP10]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[TMP10]], i32 4
-; UNROLL-NO-IC-NEXT: store <4 x i16> [[TMP8]], ptr [[TMP11]], align 2
+; UNROLL-NO-IC-NEXT: store <4 x i16> [[TMP8]], ptr [[TMP10]], align 2
; UNROLL-NO-IC-NEXT: store <4 x i16> [[TMP9]], ptr [[TMP12]], align 2
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
@@ -2476,7 +2495,7 @@ define void @sink_dead_inst(ptr %a) {
; UNROLL-NO-IC: middle.block:
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; UNROLL-NO-IC-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-IC-NEXT: br label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 13, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
@@ -2493,7 +2512,7 @@ define void @sink_dead_inst(ptr %a) {
; UNROLL-NO-IC-NEXT: [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
; UNROLL-NO-IC-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[A]], i16 [[IV]]
; UNROLL-NO-IC-NEXT: store i16 [[USE_REC_1]], ptr [[GEP]], align 2
-; UNROLL-NO-IC-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
+; UNROLL-NO-IC-NEXT: br i1 [[CMP]], label [[FOR_END:%.*]], label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
; UNROLL-NO-IC: for.end:
; UNROLL-NO-IC-NEXT: ret void
;
@@ -2523,7 +2542,7 @@ define void @sink_dead_inst(ptr %a) {
; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 42
; UNROLL-NO-VF-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; UNROLL-NO-VF: middle.block:
-; UNROLL-NO-VF-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-VF-NEXT: br label [[SCALAR_PH]]
; UNROLL-NO-VF: scalar.ph:
; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 15, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
@@ -2540,7 +2559,7 @@ define void @sink_dead_inst(ptr %a) {
; UNROLL-NO-VF-NEXT: [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
; UNROLL-NO-VF-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[A]], i16 [[IV]]
; UNROLL-NO-VF-NEXT: store i16 [[USE_REC_1]], ptr [[GEP]], align 2
-; UNROLL-NO-VF-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
+; UNROLL-NO-VF-NEXT: br i1 [[CMP]], label [[FOR_END:%.*]], label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
; UNROLL-NO-VF: for.end:
; UNROLL-NO-VF-NEXT: ret void
;
@@ -2561,8 +2580,7 @@ define void @sink_dead_inst(ptr %a) {
; SINK-AFTER-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[TMP5:%.*]] = sub <4 x i16> [[TMP4]], splat (i16 10)
; SINK-AFTER-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[A:%.*]], i16 [[OFFSET_IDX]]
-; SINK-AFTER-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[TMP6]], i32 0
-; SINK-AFTER-NEXT: store <4 x i16> [[TMP5]], ptr [[TMP7]], align 2
+; SINK-AFTER-NEXT: store <4 x i16> [[TMP5]], ptr [[TMP6]], align 2
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; SINK-AFTER-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
@@ -2570,7 +2588,7 @@ define void @sink_dead_inst(ptr %a) {
; SINK-AFTER: middle.block:
; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-; SINK-AFTER-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; SINK-AFTER-NEXT: br label [[SCALAR_PH]]
; SINK-AFTER: scalar.ph:
; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 13, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
@@ -2587,7 +2605,7 @@ define void @sink_dead_inst(ptr %a) {
; SINK-AFTER-NEXT: [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
; SINK-AFTER-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[A]], i16 [[IV]]
; SINK-AFTER-NEXT: store i16 [[USE_REC_1]], ptr [[GEP]], align 2
-; SINK-AFTER-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
+; SINK-AFTER-NEXT: br i1 [[CMP]], label [[FOR_END:%.*]], label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
; SINK-AFTER: for.end:
; SINK-AFTER-NEXT: ret void
;
@@ -3335,7 +3353,7 @@ bb:
; %vec.dead will be marked as a dead instruction in the vector loop and no
; recipe will be created for it. Make sure a valid sink target is used.
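; A rough sketch of the shape being tested, assuming the scalar loop below
; (names from the test, bodies elided):
;   loop:
;     %for      = phi i32 [ 0, %scalar.ph ], [ ..., %loop ]   ; first-order recurrence
;     %vec.dead = icmp ...                                    ; feeds only the exit branch
;     br i1 %vec.dead, label %for.end, label %loop
; With a constant trip count the exit compare is dead in the vector loop.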
-define i32 @sink_after_dead_inst(ptr %A.ptr) {
+define i32 @sink_after_dead_inst(ptr %A.ptr, i32 %n) {
; UNROLL-NO-IC-LABEL: @sink_after_dead_inst(
; UNROLL-NO-IC-NEXT: entry:
; UNROLL-NO-IC-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -3350,21 +3368,19 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = or <4 x i16> [[TMP1]], [[TMP1]]
; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]]
-; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP4]], i32 4
-; UNROLL-NO-IC-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP5]], align 4
+; UNROLL-NO-IC-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP4]], align 4
; UNROLL-NO-IC-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP6]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
; UNROLL-NO-IC-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; UNROLL-NO-IC-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-IC-NEXT: br label [[FOR_END:%.*]]
; UNROLL-NO-IC: scalar.ph:
-; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY]] ]
; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]]
; UNROLL-NO-IC: loop:
; UNROLL-NO-IC-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -3392,25 +3408,22 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
; UNROLL-NO-VF: vector.body:
; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[VECTOR_RECUR]] to i16
-; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1
; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i16 [[OFFSET_IDX]], 1
-; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = add i16 [[TMP1]], 1
-; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = or i16 [[TMP2]], [[TMP2]]
+; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = add i16 [[OFFSET_IDX]], 1
; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = or i16 [[TMP3]], [[TMP3]]
-; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = zext i16 [[TMP5]] to i32
; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]]
-; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[TMP1]]
+; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[TMP2]]
; UNROLL-NO-VF-NEXT: store i32 0, ptr [[TMP8]], align 4
; UNROLL-NO-VF-NEXT: store i32 0, ptr [[TMP9]], align 4
; UNROLL-NO-VF-NEXT: [[TMP7]] = add nuw i32 [[VECTOR_RECUR]], 2
; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP7]], 16
; UNROLL-NO-VF-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
; UNROLL-NO-VF: middle.block:
-; UNROLL-NO-VF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-VF-NEXT: br label [[FOR_END:%.*]]
; UNROLL-NO-VF: scalar.ph:
-; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY]] ]
; UNROLL-NO-VF-NEXT: br label [[LOOP:%.*]]
; UNROLL-NO-VF: loop:
; UNROLL-NO-VF-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -3427,7 +3440,7 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
; UNROLL-NO-VF-NEXT: store i32 0, ptr [[A_GEP]], align 4
; UNROLL-NO-VF-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP34:![0-9]+]]
; UNROLL-NO-VF: for.end:
-; UNROLL-NO-VF-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-VF-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
; UNROLL-NO-VF-NEXT: ret i32 [[FOR_LCSSA]]
;
; SINK-AFTER-LABEL: @sink_after_dead_inst(
@@ -3443,19 +3456,17 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
; SINK-AFTER-NEXT: [[TMP2:%.*]] = or <4 x i16> [[TMP1]], [[TMP1]]
; SINK-AFTER-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
; SINK-AFTER-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]]
-; SINK-AFTER-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
-; SINK-AFTER-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP5]], align 4
+; SINK-AFTER-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP4]], align 4
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; SINK-AFTER-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
; SINK-AFTER-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
; SINK-AFTER: middle.block:
-; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; SINK-AFTER-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; SINK-AFTER-NEXT: br label [[FOR_END:%.*]]
; SINK-AFTER: scalar.ph:
-; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ]
+; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY]] ]
; SINK-AFTER-NEXT: br label [[LOOP:%.*]]
; SINK-AFTER: loop:
; SINK-AFTER-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -3500,11 +3511,19 @@ for.end:
; %rec.1's only user is %use.rec.1, which can be removed. This enables %rec.1
; to be removed as well.
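; Roughly, the dead chain in the scalar loop below looks like:
;   %rec.1     = phi i16 [ 0, %entry ], [ %rec.1.prev, %for.cond ]
;   %use.rec.1 = sub i16 %rec.1, 10    ; sole user of %rec.1, itself unused
; Dropping %use.rec.1 leaves %rec.1 dead, so both disappear.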
-define void @unused_recurrence(ptr %a) {
+define void @unused_recurrence(ptr %a, i16 %n) {
; UNROLL-NO-IC-LABEL: @unused_recurrence(
; UNROLL-NO-IC-NEXT: entry:
-; UNROLL-NO-IC-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = add i16 [[N:%.*]], 27
+; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = zext i16 [[TMP4]] to i32
+; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP5]], 1
+; UNROLL-NO-IC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 8
+; UNROLL-NO-IC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; UNROLL-NO-IC: vector.ph:
+; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 8
+; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
+; UNROLL-NO-IC-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i16
+; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = add i16 -27, [[DOTCAST]]
; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL-NO-IC: vector.body:
; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -3514,13 +3533,14 @@ define void @unused_recurrence(ptr %a) {
; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add <4 x i16> [[TMP0]], splat (i16 5)
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
-; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
-; UNROLL-NO-IC-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
+; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
; UNROLL-NO-IC: middle.block:
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
-; UNROLL-NO-IC-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
-; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 997, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; UNROLL-NO-IC-NEXT: br label [[FOR_COND:%.*]]
; UNROLL-NO-IC: for.cond:
@@ -3529,30 +3549,39 @@ define void @unused_recurrence(ptr %a) {
; UNROLL-NO-IC-NEXT: [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
; UNROLL-NO-IC-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1
; UNROLL-NO-IC-NEXT: [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
-; UNROLL-NO-IC-NEXT: [[CMP:%.*]] = icmp eq i16 [[IV]], 1000
+; UNROLL-NO-IC-NEXT: [[CMP:%.*]] = icmp eq i16 [[IV]], [[N]]
; UNROLL-NO-IC-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]]
; UNROLL-NO-IC: for.end:
; UNROLL-NO-IC-NEXT: ret void
;
; UNROLL-NO-VF-LABEL: @unused_recurrence(
; UNROLL-NO-VF-NEXT: entry:
-; UNROLL-NO-VF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = add i16 [[N:%.*]], 27
+; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = zext i16 [[TMP4]] to i32
+; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = add nuw nsw i32 [[TMP5]], 1
+; UNROLL-NO-VF-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP6]], 2
+; UNROLL-NO-VF-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; UNROLL-NO-VF: vector.ph:
+; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP6]], 2
+; UNROLL-NO-VF-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP6]], [[N_MOD_VF]]
+; UNROLL-NO-VF-NEXT: [[DOTCAST1:%.*]] = trunc i32 [[N_VEC]] to i16
+; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = add i16 -27, [[DOTCAST1]]
; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL-NO-VF: vector.body:
; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT: [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
-; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
-; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 1
-; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i16 [[TMP0]], 1
-; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i16 [[TMP1]], 5
+; UNROLL-NO-VF-NEXT: [[DOTCAST2:%.*]] = trunc i32 [[INDEX]] to i16
+; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST2]]
+; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = add i16 [[OFFSET_IDX]], 1
+; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = add i16 [[TMP8]], 1
+; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i16 [[TMP9]], 5
; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1028
-; UNROLL-NO-VF-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
+; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-VF-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
; UNROLL-NO-VF: middle.block:
-; UNROLL-NO-VF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP6]], [[N_VEC]]
+; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-VF: scalar.ph:
-; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 1001, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; UNROLL-NO-VF-NEXT: br label [[FOR_COND:%.*]]
; UNROLL-NO-VF: for.cond:
@@ -3561,30 +3590,39 @@ define void @unused_recurrence(ptr %a) {
; UNROLL-NO-VF-NEXT: [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
; UNROLL-NO-VF-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1
; UNROLL-NO-VF-NEXT: [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
-; UNROLL-NO-VF-NEXT: [[CMP:%.*]] = icmp eq i16 [[IV]], 1000
+; UNROLL-NO-VF-NEXT: [[CMP:%.*]] = icmp eq i16 [[IV]], [[N]]
; UNROLL-NO-VF-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]]
; UNROLL-NO-VF: for.end:
; UNROLL-NO-VF-NEXT: ret void
;
; SINK-AFTER-LABEL: @unused_recurrence(
; SINK-AFTER-NEXT: entry:
-; SINK-AFTER-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SINK-AFTER-NEXT: [[TMP4:%.*]] = add i16 [[N:%.*]], 27
+; SINK-AFTER-NEXT: [[TMP5:%.*]] = zext i16 [[TMP4]] to i32
+; SINK-AFTER-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP5]], 1
+; SINK-AFTER-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 4
+; SINK-AFTER-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; SINK-AFTER: vector.ph:
+; SINK-AFTER-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 4
+; SINK-AFTER-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
+; SINK-AFTER-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i16
+; SINK-AFTER-NEXT: [[TMP3:%.*]] = add i16 -27, [[DOTCAST]]
; SINK-AFTER-NEXT: br label [[VECTOR_BODY:%.*]]
; SINK-AFTER: vector.body:
; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SINK-AFTER-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT: [[TMP0:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
-; SINK-AFTER-NEXT: [[TMP1:%.*]] = add <4 x i16> [[TMP0]], splat (i16 5)
+; SINK-AFTER-NEXT: [[TMP7:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
+; SINK-AFTER-NEXT: [[TMP1:%.*]] = add <4 x i16> [[TMP7]], splat (i16 5)
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
-; SINK-AFTER-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1028
-; SINK-AFTER-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
+; SINK-AFTER-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; SINK-AFTER-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
; SINK-AFTER: middle.block:
; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
-; SINK-AFTER-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
+; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; SINK-AFTER: scalar.ph:
-; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 1001, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
+; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; SINK-AFTER-NEXT: br label [[FOR_COND:%.*]]
; SINK-AFTER: for.cond:
@@ -3593,7 +3631,7 @@ define void @unused_recurrence(ptr %a) {
; SINK-AFTER-NEXT: [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
; SINK-AFTER-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1
; SINK-AFTER-NEXT: [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
-; SINK-AFTER-NEXT: [[CMP:%.*]] = icmp eq i16 [[IV]], 1000
+; SINK-AFTER-NEXT: [[CMP:%.*]] = icmp eq i16 [[IV]], [[N]]
; SINK-AFTER-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]]
; SINK-AFTER: for.end:
; SINK-AFTER-NEXT: ret void
@@ -3607,7 +3645,7 @@ for.cond:
%use.rec.1 = sub i16 %rec.1, 10
%iv.next= add i16 %iv, 1
%rec.1.prev = add i16 %iv.next, 5
- %cmp = icmp eq i16 %iv, 1000
+ %cmp = icmp eq i16 %iv, %n
br i1 %cmp, label %for.end, label %for.cond
for.end:
@@ -3615,42 +3653,59 @@ for.end:
}
; Test case for https://github.com/llvm/llvm-project/issues/95520.
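; The recurrence here is fed by a uniform (loop-invariant) load, per the
; scalar loop below:
;   %recur = phi i32 [ 0, %entry ], [ %load, %loop ]
;   %load  = load i32, ptr %src, align 4   ; address does not vary with the IV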
-define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
+define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst, i64 %n) {
; UNROLL-NO-IC-LABEL: @recurence_uniform_load(
; UNROLL-NO-IC-NEXT: entry:
-; UNROLL-NO-IC-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
+; UNROLL-NO-IC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
+; UNROLL-NO-IC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; UNROLL-NO-IC: vector.ph:
+; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8
+; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL-NO-IC: vector.body:
-; UNROLL-NO-IC-NEXT: br label [[MIDDLE_BLOCK:%.*]]
+; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC:%.*]], align 4
+; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: br label [[EXIT:%.*]]
+; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
-; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]]
; UNROLL-NO-IC: loop:
; UNROLL-NO-IC-NEXT: [[PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ]
; UNROLL-NO-IC-NEXT: [[RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD:%.*]], [[LOOP]] ]
; UNROLL-NO-IC-NEXT: [[ADD]] = add i64 [[PHI]], 1
-; UNROLL-NO-IC-NEXT: [[LOAD]] = load i32, ptr [[SRC:%.*]], align 4
-; UNROLL-NO-IC-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1
-; UNROLL-NO-IC-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP37:![0-9]+]]
+; UNROLL-NO-IC-NEXT: [[LOAD]] = load i32, ptr [[SRC]], align 4
+; UNROLL-NO-IC-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], [[N]]
+; UNROLL-NO-IC-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]]
; UNROLL-NO-IC: exit:
; UNROLL-NO-IC-NEXT: ret i32 0
;
; UNROLL-NO-VF-LABEL: @recurence_uniform_load(
; UNROLL-NO-VF-NEXT: entry:
-; UNROLL-NO-VF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i64 [[N:%.*]], 1
+; UNROLL-NO-VF-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 2
+; UNROLL-NO-VF-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; UNROLL-NO-VF: vector.ph:
+; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 2
+; UNROLL-NO-VF-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL-NO-VF: vector.body:
+; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
-; UNROLL-NO-VF-NEXT: br label [[MIDDLE_BLOCK:%.*]]
+; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-VF-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
; UNROLL-NO-VF: middle.block:
-; UNROLL-NO-VF-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-VF: scalar.ph:
-; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; UNROLL-NO-VF-NEXT: br label [[LOOP:%.*]]
; UNROLL-NO-VF: loop:
@@ -3658,31 +3713,40 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
; UNROLL-NO-VF-NEXT: [[RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD:%.*]], [[LOOP]] ]
; UNROLL-NO-VF-NEXT: [[ADD]] = add i64 [[PHI]], 1
; UNROLL-NO-VF-NEXT: [[LOAD]] = load i32, ptr [[SRC]], align 4
-; UNROLL-NO-VF-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1
-; UNROLL-NO-VF-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP37:![0-9]+]]
+; UNROLL-NO-VF-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], [[N]]
+; UNROLL-NO-VF-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]]
; UNROLL-NO-VF: exit:
; UNROLL-NO-VF-NEXT: ret i32 0
;
; SINK-AFTER-LABEL: @recurence_uniform_load(
; SINK-AFTER-NEXT: entry:
-; SINK-AFTER-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SINK-AFTER-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
+; SINK-AFTER-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; SINK-AFTER-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; SINK-AFTER: vector.ph:
+; SINK-AFTER-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; SINK-AFTER-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; SINK-AFTER-NEXT: br label [[VECTOR_BODY:%.*]]
; SINK-AFTER: vector.body:
-; SINK-AFTER-NEXT: br label [[MIDDLE_BLOCK:%.*]]
+; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC:%.*]], align 4
+; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; SINK-AFTER-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SINK-AFTER-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
; SINK-AFTER: middle.block:
-; SINK-AFTER-NEXT: br label [[EXIT:%.*]]
+; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; SINK-AFTER: scalar.ph:
-; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
-; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY]] ]
+; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; SINK-AFTER-NEXT: br label [[LOOP:%.*]]
; SINK-AFTER: loop:
; SINK-AFTER-NEXT: [[PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ]
; SINK-AFTER-NEXT: [[RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD:%.*]], [[LOOP]] ]
; SINK-AFTER-NEXT: [[ADD]] = add i64 [[PHI]], 1
-; SINK-AFTER-NEXT: [[LOAD]] = load i32, ptr [[SRC:%.*]], align 4
-; SINK-AFTER-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1
-; SINK-AFTER-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP37:![0-9]+]]
+; SINK-AFTER-NEXT: [[LOAD]] = load i32, ptr [[SRC]], align 4
+; SINK-AFTER-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], [[N]]
+; SINK-AFTER-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]]
; SINK-AFTER: exit:
; SINK-AFTER-NEXT: ret i32 0
;
@@ -3694,7 +3758,7 @@ loop:
%recur = phi i32 [ 0, %entry ], [ %load, %loop ]
%add = add i64 %phi, 1
%load = load i32, ptr %src, align 4
- %icmp = icmp ult i64 %phi, 1
+ %icmp = icmp ult i64 %phi, %n
br i1 %icmp, label %loop, label %exit
exit:
diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll
index fb6e6be..8a3cad0 100644
--- a/llvm/test/Transforms/LoopVectorize/float-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll
@@ -1655,7 +1655,7 @@ define i32 @float_induction_with_dbg_on_fadd(ptr %dst) {
; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
; VEC4_INTERL1-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; VEC4_INTERL1: middle.block:
-; VEC4_INTERL1-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VEC4_INTERL1-NEXT: br label [[EXIT:%.*]]
; VEC4_INTERL1: scalar.ph:
; VEC4_INTERL1-NEXT: br label [[LOOP:%.*]]
; VEC4_INTERL1: loop:
@@ -1678,7 +1678,7 @@ define i32 @float_induction_with_dbg_on_fadd(ptr %dst) {
; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
; VEC4_INTERL2-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; VEC4_INTERL2: middle.block:
-; VEC4_INTERL2-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VEC4_INTERL2-NEXT: br label [[EXIT:%.*]]
; VEC4_INTERL2: scalar.ph:
; VEC4_INTERL2-NEXT: br label [[LOOP:%.*]]
; VEC4_INTERL2: loop:
@@ -1705,7 +1705,7 @@ define i32 @float_induction_with_dbg_on_fadd(ptr %dst) {
; VEC1_INTERL2-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
; VEC1_INTERL2-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; VEC1_INTERL2: middle.block:
-; VEC1_INTERL2-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VEC1_INTERL2-NEXT: br label [[EXIT:%.*]]
; VEC1_INTERL2: scalar.ph:
; VEC1_INTERL2-NEXT: br label [[LOOP:%.*]]
; VEC1_INTERL2: loop:
diff --git a/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll b/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll
index d01984d..2c02f83 100644
--- a/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll
+++ b/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll
@@ -58,8 +58,7 @@ define float @minloopattr(ptr nocapture readonly %arg) #0 {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, ptr [[ARG]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = fcmp olt <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -67,10 +66,10 @@ define float @minloopattr(ptr nocapture readonly %arg) #0 {
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP4]])
-; CHECK-NEXT: br i1 true, label [[OUT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[OUT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 65537, [[MIDDLE_BLOCK]] ], [ 1, [[TOP:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[T]], [[TOP]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1, [[TOP:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[T]], [[TOP]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[T1:%.*]] = phi i64 [ [[T7:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
index a2eddad..720ea1f 100644
--- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
@@ -53,9 +53,8 @@ define float @fmaxnum(ptr %src, i64 %n) {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 4
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC]], align 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll
index 1ca5586..3ef37bc 100644
--- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll
@@ -202,8 +202,7 @@ define float @fmaxnum_1(ptr %src, i64 %n) {
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC]], align 4
; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -270,8 +269,7 @@ define float @fmaxnum_2(ptr %src, i64 %n) {
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC]], align 4
; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -340,8 +338,7 @@ define float @fmaxnum_induction_starts_at_10(ptr %src, i64 %n) {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[IV:%.*]] = add i64 10, [[INDEX]]
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC]], align 4
; CHECK-NEXT: [[TMP3]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -411,8 +408,7 @@ define float @fmaxnum_induction_starts_at_value(ptr %src, i64 %start, i64 %n) {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[IV:%.*]] = add i64 [[START]], [[INDEX]]
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC]], align 4
; CHECK-NEXT: [[TMP3]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll
index 68bc8d0..0f688ab 100644
--- a/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll
@@ -202,8 +202,7 @@ define float @fminnum_1(ptr %src, i64 %n) {
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC]], align 4
; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -270,8 +269,7 @@ define float @fminnum_2(ptr %src, i64 %n) {
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_SRC]], align 4
; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/forked-pointers.ll b/llvm/test/Transforms/LoopVectorize/forked-pointers.ll
index 6374fba..677163b 100644
--- a/llvm/test/Transforms/LoopVectorize/forked-pointers.ll
+++ b/llvm/test/Transforms/LoopVectorize/forked-pointers.ll
@@ -71,7 +71,7 @@ define dso_local void @forked_ptrs_different_base_same_offset(ptr nocapture read
; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
diff --git a/llvm/test/Transforms/LoopVectorize/fpsat.ll b/llvm/test/Transforms/LoopVectorize/fpsat.ll
index 84753e4..f20e2c8 100644
--- a/llvm/test/Transforms/LoopVectorize/fpsat.ll
+++ b/llvm/test/Transforms/LoopVectorize/fpsat.ll
@@ -23,12 +23,10 @@ define void @signed(ptr %x, ptr %y, i32 %n) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP6]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -98,12 +96,10 @@ define void @unsigned(ptr %x, ptr %y, i32 %n) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP6]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
index d717a3f..4811a77d 100644
--- a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
@@ -44,14 +44,10 @@ define void @test(ptr nocapture %asd, ptr nocapture %aud,
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[AUD]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[ASR]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[AUR]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META8:![0-9]+]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD23:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META12:![0-9]+]], !noalias [[META13:![0-9]+]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <2 x i32>, ptr [[TMP7]], align 4, !alias.scope [[META14:![0-9]+]], !noalias [[META15:![0-9]+]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD25:%.*]] = load <2 x i32>, ptr [[TMP8]], align 4, !alias.scope [[META15]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META8:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD23:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4, !alias.scope [[META12:![0-9]+]], !noalias [[META13:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4, !alias.scope [[META14:![0-9]+]], !noalias [[META15:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD25:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4, !alias.scope [[META15]]
; CHECK-NEXT: [[TMP9:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], splat (i32 23)
; CHECK-NEXT: [[TMP10:%.*]] = add nsw <2 x i32> [[WIDE_LOAD23]], splat (i32 24)
; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i32> [[WIDE_LOAD24]], splat (i32 25)
@@ -111,14 +107,10 @@ define void @test(ptr nocapture %asd, ptr nocapture %aud,
; CHECK-NEXT: [[PREDPHI28:%.*]] = select <2 x i1> [[TMP13]], <2 x i32> [[TMP53]], <2 x i32> [[TMP10]]
; CHECK-NEXT: [[PREDPHI29:%.*]] = select <2 x i1> [[TMP13]], <2 x i32> [[TMP54]], <2 x i32> [[TMP11]]
; CHECK-NEXT: [[PREDPHI30:%.*]] = select <2 x i1> [[TMP13]], <2 x i32> [[TMP55]], <2 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP57]], align 4, !alias.scope [[META5]], !noalias [[META8]]
-; CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI28]], ptr [[TMP58]], align 4, !alias.scope [[META12]], !noalias [[META13]]
-; CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI29]], ptr [[TMP59]], align 4, !alias.scope [[META14]], !noalias [[META15]]
-; CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI30]], ptr [[TMP60]], align 4, !alias.scope [[META15]]
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP1]], align 4, !alias.scope [[META5]], !noalias [[META8]]
+; CHECK-NEXT: store <2 x i32> [[PREDPHI28]], ptr [[TMP2]], align 4, !alias.scope [[META12]], !noalias [[META13]]
+; CHECK-NEXT: store <2 x i32> [[PREDPHI29]], ptr [[TMP3]], align 4, !alias.scope [[META14]], !noalias [[META15]]
+; CHECK-NEXT: store <2 x i32> [[PREDPHI30]], ptr [[TMP4]], align 4, !alias.scope [[META15]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP61:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
; CHECK-NEXT: br i1 [[TMP61]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
@@ -266,9 +258,9 @@ define void @test(ptr nocapture %asd, ptr nocapture %aud,
; UNROLL-NO-VF-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
; UNROLL-NO-VF-NEXT: br i1 [[TMP46]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; UNROLL-NO-VF: middle.block:
-; UNROLL-NO-VF-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-VF-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; UNROLL-NO-VF: scalar.ph:
-; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; UNROLL-NO-VF-NEXT: br label [[FOR_BODY:%.*]]
; UNROLL-NO-VF: for.cond.cleanup:
; UNROLL-NO-VF-NEXT: ret void
@@ -364,11 +356,9 @@ define void @test_scalar2scalar(ptr nocapture %asd, ptr nocapture %bsd) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE4:%.*]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4, !alias.scope [[META20:![0-9]+]], !noalias [[META23:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4, !alias.scope [[META20:![0-9]+]], !noalias [[META23:![0-9]+]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[BSD]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4, !alias.scope [[META23]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4, !alias.scope [[META23]]
; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], splat (i32 23)
; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], splat (i32 100)
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
@@ -396,8 +386,7 @@ define void @test_scalar2scalar(ptr nocapture %asd, ptr nocapture %bsd) {
; CHECK: pred.sdiv.continue4:
; CHECK-NEXT: [[TMP24:%.*]] = phi <2 x i32> [ [[TMP15]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP22]], [[PRED_SDIV_IF3]] ]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i32> [[TMP24]], <2 x i32> [[TMP5]]
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP26]], align 4, !alias.scope [[META20]], !noalias [[META23]]
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP1]], align 4, !alias.scope [[META20]], !noalias [[META23]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
; CHECK-NEXT: br i1 [[TMP27]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
@@ -472,9 +461,9 @@ define void @test_scalar2scalar(ptr nocapture %asd, ptr nocapture %bsd) {
; UNROLL-NO-VF-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
; UNROLL-NO-VF-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
; UNROLL-NO-VF: middle.block:
-; UNROLL-NO-VF-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-VF-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; UNROLL-NO-VF: scalar.ph:
-; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; UNROLL-NO-VF-NEXT: br label [[FOR_BODY:%.*]]
; UNROLL-NO-VF: for.cond.cleanup:
; UNROLL-NO-VF-NEXT: ret void
@@ -540,16 +529,14 @@ define void @pr30172(ptr nocapture %asd, ptr nocapture %bsd) !dbg !5 {;
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE4:%.*]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[ASD]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4, !alias.scope [[META29:![0-9]+]], !noalias [[META32:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4, !alias.scope [[META29:![0-9]+]], !noalias [[META32:![0-9]+]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[BSD]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4, !alias.scope [[META32]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4, !alias.scope [[META32]]
; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], splat (i32 23)
; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], splat (i32 100)
-; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i1> [[TMP6]], splat (i1 true), !dbg [[DBG34:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP6]], splat (i1 true), !dbg [[DBG34:![0-9]+]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], splat (i32 200)
-; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP7]], <2 x i1> [[TMP8]], <2 x i1> zeroinitializer, !dbg [[DBG35:![0-9]+]]
+; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP8]], <2 x i1> zeroinitializer, !dbg [[DBG35:![0-9]+]]
; CHECK-NEXT: [[TMP10:%.*]] = or <2 x i1> [[TMP9]], [[TMP6]]
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]]
@@ -576,10 +563,9 @@ define void @pr30172(ptr nocapture %asd, ptr nocapture %bsd) !dbg !5 {;
; CHECK: pred.sdiv.continue4:
; CHECK-NEXT: [[TMP28:%.*]] = phi <2 x i32> [ [[TMP19]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP26]], [[PRED_SDIV_IF3]] ]
; CHECK-NEXT: [[TMP27:%.*]] = xor <2 x i1> [[TMP8]], splat (i1 true), !dbg [[DBG35]]
-; CHECK-NEXT: [[TMP30:%.*]] = select <2 x i1> [[TMP7]], <2 x i1> [[TMP27]], <2 x i1> zeroinitializer, !dbg [[DBG35]]
+; CHECK-NEXT: [[TMP30:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP27]], <2 x i1> zeroinitializer, !dbg [[DBG35]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP30]], <2 x i32> [[TMP5]], <2 x i32> [[TMP28]]
-; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP31]], align 4, !alias.scope [[META29]], !noalias [[META32]]
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP1]], align 4, !alias.scope [[META29]], !noalias [[META32]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
; CHECK-NEXT: br i1 [[TMP32]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
@@ -668,9 +654,9 @@ define void @pr30172(ptr nocapture %asd, ptr nocapture %bsd) !dbg !5 {;
; UNROLL-NO-VF-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
; UNROLL-NO-VF-NEXT: br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
; UNROLL-NO-VF: middle.block:
-; UNROLL-NO-VF-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-VF-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; UNROLL-NO-VF: scalar.ph:
-; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; UNROLL-NO-VF-NEXT: br label [[FOR_BODY:%.*]]
; UNROLL-NO-VF: for.cond.cleanup:
; UNROLL-NO-VF-NEXT: ret void
@@ -746,8 +732,7 @@ define i32 @predicated_udiv_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[PRED_UDIV_CONTINUE2]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; CHECK: pred.udiv.if:
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
index 5c1f628b..b971400 100644
--- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -67,9 +67,9 @@ define i32 @test(ptr nocapture %f) #0 {
; UNROLL-NOSIMPLIFY-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; UNROLL-NOSIMPLIFY: middle.block:
-; UNROLL-NOSIMPLIFY-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_END:%.*]]
; UNROLL-NOSIMPLIFY: scalar.ph:
-; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_BODY:%.*]]
; UNROLL-NOSIMPLIFY: for.body:
; UNROLL-NOSIMPLIFY-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
@@ -94,8 +94,7 @@ define i32 @test(ptr nocapture %f) #0 {
; VEC: vector.body:
; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
; VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[F:%.*]], i64 [[INDEX]]
-; VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; VEC-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], splat (i32 100)
; VEC-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
; VEC-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
@@ -322,8 +321,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
; VEC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_STORE_CONTINUE2]] ]
; VEC-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[V_1]], [[INDEX]]
; VEC-NEXT: [[TMP7:%.*]] = getelementptr inbounds [768 x i32], ptr [[PTR:%.*]], i64 0, i64 [[OFFSET_IDX]]
-; VEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP8]], align 4
+; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP7]], align 4
; VEC-NEXT: br i1 [[COND_2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE2]]
; VEC: pred.store.if:
; VEC-NEXT: [[INDVARS_IV3:%.*]] = add i64 [[OFFSET_IDX]], 0
@@ -452,10 +450,10 @@ define void @minimal_bit_widths(i1 %c) {
; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; UNROLL-NOSIMPLIFY: middle.block:
-; UNROLL-NOSIMPLIFY-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_END:%.*]]
; UNROLL-NOSIMPLIFY: scalar.ph:
-; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 1000, [[ENTRY]] ]
+; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ 1000, [[ENTRY]] ]
; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_BODY:%.*]]
; UNROLL-NOSIMPLIFY: for.body:
; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = phi i64 [ [[TMP9:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -482,8 +480,7 @@ define void @minimal_bit_widths(i1 %c) {
; VEC: vector.body:
; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
; VEC-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr undef, i64 [[INDEX]]
-; VEC-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP2]], align 1
+; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP1]], align 1
; VEC-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE2]]
; VEC: pred.store.if:
; VEC-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
@@ -581,10 +578,10 @@ define void @minimal_bit_widths_with_aliasing_store(i1 %c, ptr %ptr) {
; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; UNROLL-NOSIMPLIFY: middle.block:
-; UNROLL-NOSIMPLIFY-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_END:%.*]]
; UNROLL-NOSIMPLIFY: scalar.ph:
-; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 1000, [[ENTRY]] ]
+; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ 1000, [[ENTRY]] ]
; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_BODY:%.*]]
; UNROLL-NOSIMPLIFY: for.body:
; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = phi i64 [ [[TMP9:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -612,9 +609,8 @@ define void @minimal_bit_widths_with_aliasing_store(i1 %c, ptr %ptr) {
; VEC: vector.body:
; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
; VEC-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[INDEX]]
-; VEC-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP2]], i32 0
-; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP3]], align 1
-; VEC-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP3]], align 1
+; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP2]], align 1
+; VEC-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP2]], align 1
; VEC-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE2]]
; VEC: pred.store.if:
; VEC-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
@@ -714,7 +710,7 @@ define void @sdiv_with_uniform_ops(i16 %0, i1 %c, ptr %dst) {
; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 98
; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; UNROLL-NOSIMPLIFY: middle.block:
-; UNROLL-NOSIMPLIFY-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; UNROLL-NOSIMPLIFY-NEXT: br label [[SCALAR_PH]]
; UNROLL-NOSIMPLIFY: scalar.ph:
; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 99, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
; UNROLL-NOSIMPLIFY-NEXT: br label [[LOOP_HEADER:%.*]]
@@ -728,7 +724,7 @@ define void @sdiv_with_uniform_ops(i16 %0, i1 %c, ptr %dst) {
; UNROLL-NOSIMPLIFY: loop.latch:
; UNROLL-NOSIMPLIFY-NEXT: [[INC]] = add i16 [[IV]], 1
; UNROLL-NOSIMPLIFY-NEXT: [[EC:%.*]] = icmp eq i16 [[INC]], 100
-; UNROLL-NOSIMPLIFY-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]]
+; UNROLL-NOSIMPLIFY-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]]
; UNROLL-NOSIMPLIFY: exit:
; UNROLL-NOSIMPLIFY-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/if-reduction.ll b/llvm/test/Transforms/LoopVectorize/if-reduction.ll
index b4aba56..65330aa 100644
--- a/llvm/test/Transforms/LoopVectorize/if-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-reduction.ll
@@ -33,8 +33,7 @@ define float @fcmp_0_fadd_select1(ptr noalias %x, i32 %N) nounwind readonly {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast ogt <4 x float> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP4]], <4 x float> [[VEC_PHI]]
@@ -122,8 +121,7 @@ define double @fcmp_0_fadd_select2(ptr noalias %x, i32 %N) nounwind readonly {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast ogt <4 x double> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <4 x double> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x double> [[TMP4]], <4 x double> [[VEC_PHI]]
@@ -214,8 +212,7 @@ define float @fcmp_val_fadd_select1(ptr noalias %x, float %y, i32 %N) nounwind r
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast ogt <4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP4]], <4 x float> [[VEC_PHI]]
@@ -306,8 +303,7 @@ define double @fcmp_val_fadd_select2(ptr noalias %x, double %y, i32 %N) nounwind
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast ogt <4 x double> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <4 x double> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x double> [[TMP4]], <4 x double> [[VEC_PHI]]
@@ -397,11 +393,9 @@ define float @fcmp_array_elm_fadd_select1(ptr noalias %x, ptr noalias %y, i32 %N
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fcmp fast ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP7]] = select <4 x i1> [[TMP5]], <4 x float> [[TMP6]], <4 x float> [[VEC_PHI]]
@@ -495,11 +489,9 @@ define double @fcmp_array_elm_fadd_select2(ptr noalias %x, ptr noalias %y, i32 %
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x double>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x double>, ptr [[TMP3]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fcmp fast ogt <4 x double> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <4 x double> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP7]] = select <4 x i1> [[TMP5]], <4 x double> [[TMP6]], <4 x double> [[VEC_PHI]]
@@ -591,8 +583,7 @@ define float @fcmp_0_fsub_select1(ptr noalias %x, i32 %N) nounwind readonly {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast ogt <4 x float> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP4]], <4 x float> [[VEC_PHI]]
@@ -742,8 +733,7 @@ define double @fcmp_0_fsub_select2(ptr noalias %x, i32 %N) nounwind readonly {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast ogt <4 x double> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x double> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x double> [[TMP4]], <4 x double> [[VEC_PHI]]
@@ -894,8 +884,7 @@ define float @fcmp_0_fmult_select1(ptr noalias %x, i32 %N) nounwind readonly {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast ogt <4 x float> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP4]], <4 x float> [[VEC_PHI]]
@@ -1046,8 +1035,7 @@ define double @fcmp_0_fmult_select2(ptr noalias %x, i32 %N) nounwind readonly {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ <double 0.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast ogt <4 x double> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <4 x double> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x double> [[TMP4]], <4 x double> [[VEC_PHI]]
@@ -1203,8 +1191,7 @@ define float @fcmp_multi(ptr nocapture readonly %a, i32 %n) nounwind readonly {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], splat (float 1.000000e+00)
; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true)
; CHECK-NEXT: [[TMP5:%.*]] = fcmp olt <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00)
@@ -1327,8 +1314,7 @@ define float @fcmp_fadd_fsub(ptr nocapture readonly %a, i32 %n) nounwind readonl
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PREDPHI1:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], splat (float 1.000000e+00)
; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true)
; CHECK-NEXT: [[TMP8:%.*]] = fcmp uge <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00)
@@ -1586,8 +1572,7 @@ define i64 @fcmp_0_add_select2(ptr noalias %x, i64 %N) nounwind readonly {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[VEC_PHI]], splat (i64 2)
; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x i64> [[TMP4]], <4 x i64> [[VEC_PHI]]
@@ -1744,8 +1729,7 @@ define i32 @fcmp_0_mult_select1(ptr noalias %x, i32 %N) nounwind readonly {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 1, i32 1>, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[VEC_PHI]], splat (i32 2)
; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]]
diff --git a/llvm/test/Transforms/LoopVectorize/induction-multiple-uses-in-same-instruction.ll b/llvm/test/Transforms/LoopVectorize/induction-multiple-uses-in-same-instruction.ll
index 540170a..9cff1cf 100644
--- a/llvm/test/Transforms/LoopVectorize/induction-multiple-uses-in-same-instruction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction-multiple-uses-in-same-instruction.ll
@@ -24,9 +24,9 @@ define void @multiple_iv_uses_in_same_instruction(ptr %ptr) {
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/induction-step.ll b/llvm/test/Transforms/LoopVectorize/induction-step.ll
index 036d5f5..59f6e8b04f 100644
--- a/llvm/test/Transforms/LoopVectorize/induction-step.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction-step.ll
@@ -47,8 +47,7 @@ define void @induction_with_global(i32 %init, ptr noalias nocapture %A, i32 %N)
; CHECK-NEXT: [[TMP8:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ [[INDUCTION4]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; CHECK-NEXT: store <8 x i32> [[VEC_IND]], ptr [[TMP10]], align 4
+; CHECK-NEXT: store <8 x i32> [[VEC_IND]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP8]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT6]]
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -137,8 +136,7 @@ define i32 @induction_with_loop_inv(i32 %init, ptr noalias nocapture %A, i32 %N,
; CHECK-NEXT: [[TMP6:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ [[INDUCTION4]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT: store <8 x i32> [[VEC_IND]], ptr [[TMP8]], align 4
+; CHECK-NEXT: store <8 x i32> [[VEC_IND]], ptr [[TMP7]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP6]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], [[DOTSPLAT6]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -231,8 +229,7 @@ define void @non_primary_iv_loop_inv_trunc(ptr %a, i64 %n, i64 %step) {
; CHECK-NEXT: [[TMP6:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND10:%.*]] = phi <8 x i32> [ [[INDUCTION7]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT11:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT: store <8 x i32> [[VEC_IND10]], ptr [[TMP8]], align 4
+; CHECK-NEXT: store <8 x i32> [[VEC_IND10]], ptr [[TMP7]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP6]], 8
; CHECK-NEXT: [[VEC_IND_NEXT11]] = add <8 x i32> [[VEC_IND10]], [[DOTSPLAT9]]
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -287,16 +284,15 @@ define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) {
; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <8 x i64> [[VEC_IND]], ptr [[TMP2]], align 8
+; CHECK-NEXT: store <8 x i64> [[VEC_IND]], ptr [[TMP1]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT_P:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -357,8 +353,7 @@ define void @wide_add_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i16> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i16> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <8 x i16> [[TMP4]], ptr [[TMP6]], align 2
+; CHECK-NEXT: store <8 x i16> [[TMP4]], ptr [[TMP5]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP3]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i16> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -368,11 +363,11 @@ define void @wide_add_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i16 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i16 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[ADD]] = add i16 [[IV_2]], [[O_1]]
; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]]
; CHECK-NEXT: store i16 [[ADD]], ptr [[GEP_DST]], align 2
@@ -428,8 +423,7 @@ define void @wide_sub_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i16> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = sub <8 x i16> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <8 x i16> [[TMP5]], ptr [[TMP7]], align 2
+; CHECK-NEXT: store <8 x i16> [[TMP5]], ptr [[TMP6]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP4]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i16> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -439,11 +433,11 @@ define void @wide_sub_induction_step_live_in(ptr %dst, i64 %N, i16 %off) {
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i16 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i16 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[SUB:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ], [ [[SUB:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[SUB]] = sub i16 [[IV_2]], [[O_1]]
; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]]
; CHECK-NEXT: store i16 [[SUB]], ptr [[GEP_DST]], align 2
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index 5a5b06d..77b91cc 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -27,8 +27,7 @@ define void @multi_int_induction(ptr %A, i32 %N) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 190, i32 191>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP5]], align 4
+; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP4]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -156,9 +155,8 @@ define void @multi_int_induction(ptr %A, i32 %N) {
; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 190, i32 191>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 2
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP5]], align 4
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP4]], align 4
; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP6]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2)
@@ -283,15 +281,13 @@ define void @scalar_use(ptr %a, float %b, i64 %offset, i64 %offset2, i64 %n) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], [[OFFSET]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP8]], align 4, !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP7]], align 4, !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]]
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[OFFSET2]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x float>, ptr [[TMP11]], align 4, !alias.scope [[META7]]
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x float>, ptr [[TMP10]], align 4, !alias.scope [[META7]]
; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD4]]
; CHECK-NEXT: [[TMP13:%.*]] = fadd fast <2 x float> [[WIDE_LOAD]], [[TMP12]]
-; CHECK-NEXT: store <2 x float> [[TMP13]], ptr [[TMP8]], align 4, !alias.scope [[META4]], !noalias [[META7]]
+; CHECK-NEXT: store <2 x float> [[TMP13]], ptr [[TMP7]], align 4, !alias.scope [[META4]], !noalias [[META7]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
@@ -473,21 +469,19 @@ define void @scalar_use(ptr %a, float %b, i64 %offset, i64 %offset2, i64 %n) {
; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], [[OFFSET]]
; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
-; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 2
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP8]], align 4, !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]]
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP7]], align 4, !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]]
; UNROLL-NO-IC-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x float>, ptr [[TMP9]], align 4, !alias.scope [[META4]], !noalias [[META7]]
; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], [[OFFSET2]]
; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP10]]
-; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 2
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x float>, ptr [[TMP12]], align 4, !alias.scope [[META7]]
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x float>, ptr [[TMP11]], align 4, !alias.scope [[META7]]
; UNROLL-NO-IC-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x float>, ptr [[TMP13]], align 4, !alias.scope [[META7]]
; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD5]]
; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD6]]
; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = fadd fast <2 x float> [[WIDE_LOAD]], [[TMP14]]
; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = fadd fast <2 x float> [[WIDE_LOAD4]], [[TMP15]]
-; UNROLL-NO-IC-NEXT: store <2 x float> [[TMP16]], ptr [[TMP8]], align 4, !alias.scope [[META4]], !noalias [[META7]]
+; UNROLL-NO-IC-NEXT: store <2 x float> [[TMP16]], ptr [[TMP7]], align 4, !alias.scope [[META4]], !noalias [[META7]]
; UNROLL-NO-IC-NEXT: store <2 x float> [[TMP17]], ptr [[TMP9]], align 4, !alias.scope [[META4]], !noalias [[META7]]
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -628,8 +622,7 @@ define i64 @scalarize_induction_variable_01(ptr %a, i64 %n) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8
; CHECK-NEXT: [[TMP3]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -750,9 +743,8 @@ define i64 @scalarize_induction_variable_01(ptr %a, i64 %n) {
; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 2
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8
; UNROLL-NO-IC-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
; UNROLL-NO-IC-NEXT: [[TMP4]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
; UNROLL-NO-IC-NEXT: [[TMP5]] = add <2 x i64> [[WIDE_LOAD2]], [[VEC_PHI1]]
@@ -1961,8 +1953,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[PRED_UDIV_CONTINUE2]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; CHECK: pred.udiv.if:
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0
@@ -2185,9 +2176,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) {
; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP32:%.*]], [[PRED_UDIV_CONTINUE8]] ]
; UNROLL-NO-IC-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[PRED_UDIV_CONTINUE8]] ]
; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]]
-; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 2
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4
; UNROLL-NO-IC-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
; UNROLL-NO-IC-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; UNROLL-NO-IC: pred.udiv.if:
@@ -2755,10 +2745,10 @@ define i32 @i8_loop() nounwind readnone ssp uwtable {
; CHECK-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP0]])
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -2782,7 +2772,7 @@ define i32 @i8_loop() nounwind readnone ssp uwtable {
; IND-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; IND-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; IND: middle.block:
-; IND-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IND-NEXT: br label [[EXIT:%.*]]
; IND: scalar.ph:
; IND-NEXT: br label [[LOOP:%.*]]
; IND: loop:
@@ -2801,7 +2791,7 @@ define i32 @i8_loop() nounwind readnone ssp uwtable {
; UNROLL-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; UNROLL-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; UNROLL: middle.block:
-; UNROLL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; UNROLL-NEXT: br label [[EXIT:%.*]]
; UNROLL: scalar.ph:
; UNROLL-NEXT: br label [[LOOP:%.*]]
; UNROLL: loop:
@@ -2826,10 +2816,10 @@ define i32 @i8_loop() nounwind readnone ssp uwtable {
; UNROLL-NO-IC: middle.block:
; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = and <2 x i32> [[TMP1]], [[TMP0]]
; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]])
-; UNROLL-NO-IC-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-IC-NEXT: br label [[EXIT:%.*]]
; UNROLL-NO-IC: scalar.ph:
-; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[ENTRY]] ]
; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]]
; UNROLL-NO-IC: loop:
; UNROLL-NO-IC-NEXT: [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -2853,7 +2843,7 @@ define i32 @i8_loop() nounwind readnone ssp uwtable {
; INTERLEAVE-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; INTERLEAVE-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; INTERLEAVE: middle.block:
-; INTERLEAVE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; INTERLEAVE-NEXT: br label [[EXIT:%.*]]
; INTERLEAVE: scalar.ph:
; INTERLEAVE-NEXT: br label [[LOOP:%.*]]
; INTERLEAVE: loop:
@@ -2892,10 +2882,10 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
; CHECK-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP0]])
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -2919,7 +2909,7 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
; IND-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 65536
; IND-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
; IND: middle.block:
-; IND-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IND-NEXT: br label [[EXIT:%.*]]
; IND: scalar.ph:
; IND-NEXT: br label [[LOOP:%.*]]
; IND: loop:
@@ -2938,7 +2928,7 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
; UNROLL-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 65536
; UNROLL-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
; UNROLL: middle.block:
-; UNROLL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; UNROLL-NEXT: br label [[EXIT:%.*]]
; UNROLL: scalar.ph:
; UNROLL-NEXT: br label [[LOOP:%.*]]
; UNROLL: loop:
@@ -2963,10 +2953,10 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
; UNROLL-NO-IC: middle.block:
; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = and <2 x i32> [[TMP1]], [[TMP0]]
; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]])
-; UNROLL-NO-IC-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-IC-NEXT: br label [[EXIT:%.*]]
; UNROLL-NO-IC: scalar.ph:
-; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[ENTRY]] ]
; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]]
; UNROLL-NO-IC: loop:
; UNROLL-NO-IC-NEXT: [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
@@ -2990,7 +2980,7 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
; INTERLEAVE-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 65536
; INTERLEAVE-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
; INTERLEAVE: middle.block:
-; INTERLEAVE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; INTERLEAVE-NEXT: br label [[EXIT:%.*]]
; INTERLEAVE: scalar.ph:
; INTERLEAVE-NEXT: br label [[LOOP:%.*]]
; INTERLEAVE: loop:
@@ -3384,8 +3374,7 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) {
; CHECK-NEXT: [[DOTCAST4:%.*]] = trunc i32 [[INDEX]] to i8
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST4]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP14]], align 4
+; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP13]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -3589,9 +3578,8 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) {
; UNROLL-NO-IC-NEXT: [[DOTCAST4:%.*]] = trunc i32 [[INDEX]] to i8
; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST4]]
; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[OFFSET_IDX]]
-; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP14]], align 4
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP13]], align 4
; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP15]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2)
@@ -3762,8 +3750,7 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) {
; CHECK-NEXT: [[DOTCAST4:%.*]] = trunc i32 [[INDEX]] to i8
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST4]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
-; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP15]], align 4
+; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP14]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 8)
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -3976,9 +3963,8 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) {
; UNROLL-NO-IC-NEXT: [[DOTCAST4:%.*]] = trunc i32 [[INDEX]] to i8
; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST4]]
; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[OFFSET_IDX]]
-; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 2
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP15]], align 4
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP14]], align 4
; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP16]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 8)
@@ -4126,8 +4112,7 @@ define void @veciv(ptr nocapture %a, i32 %start, i32 %k) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP2]], align 4
+; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -4232,9 +4217,8 @@ define void @veciv(ptr nocapture %a, i32 %start, i32 %k) {
; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]]
-; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 2
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP2]], align 4
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP3]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2)
@@ -4329,8 +4313,7 @@ define void @trunciv(ptr nocapture %a, i32 %start, i64 %k) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDEX]] to i32
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP8]], align 4
+; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP7]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -4456,9 +4439,8 @@ define void @trunciv(ptr nocapture %a, i32 %start, i64 %k) {
; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = trunc i64 [[INDEX]] to i32
; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]]
-; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 2
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP8]], align 4
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP7]], align 4
; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP9]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2)
@@ -4561,8 +4543,7 @@ define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[I]], [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP3]], align 4
+; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP2]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -4685,9 +4666,8 @@ define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) {
; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[I]], [[INDEX]]
; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[OFFSET_IDX]]
-; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 2
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP3]], align 4
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP2]], align 4
; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP4]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2)
@@ -4782,8 +4762,7 @@ define void @non_primary_iv_trunc(ptr %a, i64 %n) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 2>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP2]], align 4
+; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -4902,9 +4881,8 @@ define void @non_primary_iv_trunc(ptr %a, i64 %n) {
; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 2>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 4)
; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 2
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP2]], align 4
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP3]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 4)
@@ -5037,10 +5015,10 @@ define i32 @PR32419(i32 %a, i16 %b) {
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP15]])
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ -20, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ -20, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[A]], [[ENTRY]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ]
@@ -5101,7 +5079,7 @@ define i32 @PR32419(i32 %a, i16 %b) {
; IND-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]]
; IND: middle.block:
; IND-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP14]])
-; IND-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IND-NEXT: br label [[FOR_END:%.*]]
; IND: scalar.ph:
; IND-NEXT: br label [[FOR_BODY:%.*]]
; IND: for.body:
@@ -5177,7 +5155,7 @@ define i32 @PR32419(i32 %a, i16 %b) {
; UNROLL: middle.block:
; UNROLL-NEXT: [[BIN_RDX:%.*]] = or <2 x i32> [[TMP27]], [[TMP26]]
; UNROLL-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[BIN_RDX]])
-; UNROLL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NEXT: br label [[FOR_END:%.*]]
; UNROLL: scalar.ph:
; UNROLL-NEXT: br label [[FOR_BODY:%.*]]
; UNROLL: for.body:
@@ -5257,10 +5235,10 @@ define i32 @PR32419(i32 %a, i16 %b) {
; UNROLL-NO-IC: middle.block:
; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = or <2 x i32> [[TMP29]], [[TMP28]]
; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[BIN_RDX]])
-; UNROLL-NO-IC-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-IC-NEXT: br label [[FOR_END:%.*]]
; UNROLL-NO-IC: scalar.ph:
-; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ -20, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP31]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ -20, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[A]], [[ENTRY]] ]
; UNROLL-NO-IC-NEXT: br label [[FOR_BODY:%.*]]
; UNROLL-NO-IC: for.body:
; UNROLL-NO-IC-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ]
@@ -5381,7 +5359,7 @@ define i32 @PR32419(i32 %a, i16 %b) {
; INTERLEAVE: middle.block:
; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = or <4 x i32> [[TMP47]], [[TMP46]]
; INTERLEAVE-NEXT: [[TMP49:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[BIN_RDX]])
-; INTERLEAVE-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; INTERLEAVE-NEXT: br label [[SCALAR_PH]]
; INTERLEAVE: scalar.ph:
; INTERLEAVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP49]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY:%.*]] ]
; INTERLEAVE-NEXT: br label [[FOR_BODY:%.*]]
@@ -5400,10 +5378,9 @@ define i32 @PR32419(i32 %a, i16 %b) {
; INTERLEAVE-NEXT: [[VAR6]] = or i32 [[VAR0]], [[VAR4]]
; INTERLEAVE-NEXT: [[I_NEXT]] = add nsw i32 [[I]], 1
; INTERLEAVE-NEXT: [[COND:%.*]] = icmp eq i32 [[I_NEXT]], 0
-; INTERLEAVE-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP47:![0-9]+]]
+; INTERLEAVE-NEXT: br i1 [[COND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP47:![0-9]+]]
; INTERLEAVE: for.end:
-; INTERLEAVE-NEXT: [[VAR7:%.*]] = phi i32 [ [[VAR6]], [[FOR_INC]] ], [ poison, [[MIDDLE_BLOCK]] ]
-; INTERLEAVE-NEXT: ret i32 [[VAR7]]
+; INTERLEAVE-NEXT: ret i32 [[VAR6]]
;
entry:
br label %for.body
@@ -5470,7 +5447,7 @@ define i64 @trunc_with_first_order_recurrence() {
; CHECK: middle.block:
; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP10]])
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[VEC_IND2]], i32 1
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
@@ -5478,7 +5455,7 @@ define i64 @trunc_with_first_order_recurrence() {
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 42, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: exit:
-; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[C23:%.*]], [[LOOP]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[C23:%.*]], [[LOOP]] ]
; CHECK-NEXT: ret i64 [[DOTLCSSA]]
; CHECK: loop:
; CHECK-NEXT: [[C5:%.*]] = phi i64 [ [[C23]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
@@ -5500,7 +5477,7 @@ define i64 @trunc_with_first_order_recurrence() {
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[C24]] = add nuw nsw i32 [[X]], 1
; CHECK-NEXT: [[EXITCOND_I:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 114
-; CHECK-NEXT: br i1 [[EXITCOND_I]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP49:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_I]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP49:![0-9]+]]
;
; IND-LABEL: @trunc_with_first_order_recurrence(
; IND-NEXT: entry:
@@ -5534,16 +5511,15 @@ define i64 @trunc_with_first_order_recurrence() {
; IND: middle.block:
; IND-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP10]])
; IND-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[VEC_IND2]], i64 1
-; IND-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IND-NEXT: br label [[SCALAR_PH]]
; IND: scalar.ph:
; IND-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY:%.*]] ]
; IND-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY]] ]
; IND-NEXT: br label [[LOOP:%.*]]
; IND: exit:
-; IND-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[C23:%.*]], [[LOOP]] ], [ poison, [[MIDDLE_BLOCK]] ]
-; IND-NEXT: ret i64 [[DOTLCSSA]]
+; IND-NEXT: ret i64 [[DOTLCSSA:%.*]]
; IND: loop:
-; IND-NEXT: [[C5:%.*]] = phi i64 [ [[C23]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; IND-NEXT: [[C5:%.*]] = phi i64 [ [[DOTLCSSA]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; IND-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ], [ 113, [[SCALAR_PH]] ]
; IND-NEXT: [[X:%.*]] = phi i32 [ [[C24:%.*]], [[LOOP]] ], [ 113, [[SCALAR_PH]] ]
; IND-NEXT: [[Y:%.*]] = phi i32 [ [[C6:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
@@ -5558,11 +5534,11 @@ define i64 @trunc_with_first_order_recurrence() {
; IND-NEXT: [[C14:%.*]] = shl i32 [[INDVARS_IV_TR]], 1
; IND-NEXT: [[C15:%.*]] = add i32 [[C9]], [[C14]]
; IND-NEXT: [[C16:%.*]] = sext i32 [[C15]] to i64
-; IND-NEXT: [[C23]] = add i64 [[C13]], [[C16]]
+; IND-NEXT: [[DOTLCSSA]] = add i64 [[C13]], [[C16]]
; IND-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; IND-NEXT: [[C24]] = add nuw nsw i32 [[X]], 1
; IND-NEXT: [[EXITCOND_I:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 114
-; IND-NEXT: br i1 [[EXITCOND_I]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP49:![0-9]+]]
+; IND-NEXT: br i1 [[EXITCOND_I]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP49:![0-9]+]]
;
; UNROLL-LABEL: @trunc_with_first_order_recurrence(
; UNROLL-NEXT: entry:
@@ -5612,16 +5588,15 @@ define i64 @trunc_with_first_order_recurrence() {
; UNROLL-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP21]], [[TMP20]]
; UNROLL-NEXT: [[TMP23:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
; UNROLL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[STEP_ADD7]], i64 1
-; UNROLL-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; UNROLL-NEXT: br label [[SCALAR_PH]]
; UNROLL: scalar.ph:
; UNROLL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY:%.*]] ]
; UNROLL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY]] ]
; UNROLL-NEXT: br label [[LOOP:%.*]]
; UNROLL: exit:
-; UNROLL-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[C23:%.*]], [[LOOP]] ], [ poison, [[MIDDLE_BLOCK]] ]
-; UNROLL-NEXT: ret i64 [[DOTLCSSA]]
+; UNROLL-NEXT: ret i64 [[DOTLCSSA:%.*]]
; UNROLL: loop:
-; UNROLL-NEXT: [[C5:%.*]] = phi i64 [ [[C23]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; UNROLL-NEXT: [[C5:%.*]] = phi i64 [ [[DOTLCSSA]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; UNROLL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ], [ 113, [[SCALAR_PH]] ]
; UNROLL-NEXT: [[X:%.*]] = phi i32 [ [[C24:%.*]], [[LOOP]] ], [ 113, [[SCALAR_PH]] ]
; UNROLL-NEXT: [[Y:%.*]] = phi i32 [ [[C6:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
@@ -5636,11 +5611,11 @@ define i64 @trunc_with_first_order_recurrence() {
; UNROLL-NEXT: [[C14:%.*]] = shl i32 [[INDVARS_IV_TR]], 1
; UNROLL-NEXT: [[C15:%.*]] = add i32 [[C9]], [[C14]]
; UNROLL-NEXT: [[C16:%.*]] = sext i32 [[C15]] to i64
-; UNROLL-NEXT: [[C23]] = add i64 [[C13]], [[C16]]
+; UNROLL-NEXT: [[DOTLCSSA]] = add i64 [[C13]], [[C16]]
; UNROLL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; UNROLL-NEXT: [[C24]] = add nuw nsw i32 [[X]], 1
; UNROLL-NEXT: [[EXITCOND_I:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 114
-; UNROLL-NEXT: br i1 [[EXITCOND_I]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP49:![0-9]+]]
+; UNROLL-NEXT: br i1 [[EXITCOND_I]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP49:![0-9]+]]
;
; UNROLL-NO-IC-LABEL: @trunc_with_first_order_recurrence(
; UNROLL-NO-IC-NEXT: entry:
@@ -5690,7 +5665,7 @@ define i64 @trunc_with_first_order_recurrence() {
; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP21]], [[TMP20]]
; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[STEP_ADD7]], i32 1
-; UNROLL-NO-IC-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-IC-NEXT: br label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 113, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
@@ -5698,7 +5673,7 @@ define i64 @trunc_with_first_order_recurrence() {
; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 42, [[ENTRY]] ]
; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]]
; UNROLL-NO-IC: exit:
-; UNROLL-NO-IC-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[C23:%.*]], [[LOOP]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-IC-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[C23:%.*]], [[LOOP]] ]
; UNROLL-NO-IC-NEXT: ret i64 [[DOTLCSSA]]
; UNROLL-NO-IC: loop:
; UNROLL-NO-IC-NEXT: [[C5:%.*]] = phi i64 [ [[C23]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
@@ -5720,7 +5695,7 @@ define i64 @trunc_with_first_order_recurrence() {
; UNROLL-NO-IC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; UNROLL-NO-IC-NEXT: [[C24]] = add nuw nsw i32 [[X]], 1
; UNROLL-NO-IC-NEXT: [[EXITCOND_I:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 114
-; UNROLL-NO-IC-NEXT: br i1 [[EXITCOND_I]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP49:![0-9]+]]
+; UNROLL-NO-IC-NEXT: br i1 [[EXITCOND_I]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP49:![0-9]+]]
;
; INTERLEAVE-LABEL: @trunc_with_first_order_recurrence(
; INTERLEAVE-NEXT: entry:
@@ -5770,16 +5745,15 @@ define i64 @trunc_with_first_order_recurrence() {
; INTERLEAVE-NEXT: [[BIN_RDX:%.*]] = add <4 x i64> [[TMP21]], [[TMP20]]
; INTERLEAVE-NEXT: [[TMP23:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[BIN_RDX]])
; INTERLEAVE-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD7]], i64 3
-; INTERLEAVE-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; INTERLEAVE-NEXT: br label [[SCALAR_PH]]
; INTERLEAVE: scalar.ph:
; INTERLEAVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY:%.*]] ]
; INTERLEAVE-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY]] ]
; INTERLEAVE-NEXT: br label [[LOOP:%.*]]
; INTERLEAVE: exit:
-; INTERLEAVE-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[C23:%.*]], [[LOOP]] ], [ poison, [[MIDDLE_BLOCK]] ]
-; INTERLEAVE-NEXT: ret i64 [[DOTLCSSA]]
+; INTERLEAVE-NEXT: ret i64 [[DOTLCSSA:%.*]]
; INTERLEAVE: loop:
-; INTERLEAVE-NEXT: [[C5:%.*]] = phi i64 [ [[C23]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; INTERLEAVE-NEXT: [[C5:%.*]] = phi i64 [ [[DOTLCSSA]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; INTERLEAVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ], [ 113, [[SCALAR_PH]] ]
; INTERLEAVE-NEXT: [[X:%.*]] = phi i32 [ [[C24:%.*]], [[LOOP]] ], [ 113, [[SCALAR_PH]] ]
; INTERLEAVE-NEXT: [[Y:%.*]] = phi i32 [ [[C6:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
@@ -5794,11 +5768,11 @@ define i64 @trunc_with_first_order_recurrence() {
; INTERLEAVE-NEXT: [[C14:%.*]] = shl i32 [[INDVARS_IV_TR]], 1
; INTERLEAVE-NEXT: [[C15:%.*]] = add i32 [[C9]], [[C14]]
; INTERLEAVE-NEXT: [[C16:%.*]] = sext i32 [[C15]] to i64
-; INTERLEAVE-NEXT: [[C23]] = add i64 [[C13]], [[C16]]
+; INTERLEAVE-NEXT: [[DOTLCSSA]] = add i64 [[C13]], [[C16]]
; INTERLEAVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; INTERLEAVE-NEXT: [[C24]] = add nuw nsw i32 [[X]], 1
; INTERLEAVE-NEXT: [[EXITCOND_I:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 114
-; INTERLEAVE-NEXT: br i1 [[EXITCOND_I]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP49:![0-9]+]]
+; INTERLEAVE-NEXT: br i1 [[EXITCOND_I]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP49:![0-9]+]]
;
entry:
br label %loop
@@ -5851,19 +5825,17 @@ define void @pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr %
; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 [[TMP0]]
; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <2 x i32> [[TMP6]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store <2 x i32> [[TMP6]], ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[VEC_IND]], i32 1
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -5906,7 +5878,7 @@ define void @pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr %
; IND-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; IND-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
; IND: middle.block:
-; IND-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IND-NEXT: br label [[EXIT:%.*]]
; IND: scalar.ph:
; IND-NEXT: br label [[LOOP:%.*]]
; IND: loop:
@@ -5944,7 +5916,7 @@ define void @pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr %
; UNROLL-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; UNROLL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
; UNROLL: middle.block:
-; UNROLL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; UNROLL-NEXT: br label [[EXIT:%.*]]
; UNROLL: scalar.ph:
; UNROLL-NEXT: br label [[LOOP:%.*]]
; UNROLL: loop:
@@ -5973,21 +5945,19 @@ define void @pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr %
; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 [[TMP0]]
; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP5]]
; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = add <2 x i32> [[STEP_ADD]], [[TMP6]]
-; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP7]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP7]], i32 2
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP8]], ptr [[TMP10]], align 4
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP8]], ptr [[TMP7]], align 4
; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP9]], ptr [[TMP11]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2)
; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; UNROLL-NO-IC-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[STEP_ADD]], i32 1
-; UNROLL-NO-IC-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-IC-NEXT: br label [[EXIT:%.*]]
; UNROLL-NO-IC: scalar.ph:
-; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY]] ]
; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]]
; UNROLL-NO-IC: loop:
; UNROLL-NO-IC-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -6037,7 +6007,7 @@ define void @pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr %
; INTERLEAVE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
; INTERLEAVE: middle.block:
; INTERLEAVE-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i64 3
-; INTERLEAVE-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; INTERLEAVE-NEXT: br label [[SCALAR_PH]]
; INTERLEAVE: scalar.ph:
; INTERLEAVE-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY:%.*]] ]
; INTERLEAVE-NEXT: br label [[LOOP:%.*]]
@@ -6056,7 +6026,7 @@ define void @pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr %
; INTERLEAVE-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[IV_TRUNC]]
; INTERLEAVE-NEXT: store i32 [[ADD]], ptr [[DST_GEP]], align 4
; INTERLEAVE-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TRUNC_IV_NEXT]], 100
-; INTERLEAVE-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP51:![0-9]+]]
+; INTERLEAVE-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP51:![0-9]+]]
; INTERLEAVE: exit:
; INTERLEAVE-NEXT: ret void
;
@@ -6133,8 +6103,7 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
; CHECK-NEXT: [[VEC_IND]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0
-; CHECK-NEXT: store <2 x i32> [[TMP20]], ptr [[TMP22]], align 4
+; CHECK-NEXT: store <2 x i32> [[TMP20]], ptr [[TMP21]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], [[DOTSPLAT3]]
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -6357,9 +6326,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n
; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> <i32 1, i32 2>
; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = shufflevector <2 x i32> [[VEC_IND]], <2 x i32> [[STEP_ADD]], <2 x i32> <i32 1, i32 2>
; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[INDEX]]
-; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 2
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP20]], ptr [[TMP23]], align 4
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP20]], ptr [[TMP22]], align 4
; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP21]], ptr [[TMP24]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], [[TMP17]]
diff --git a/llvm/test/Transforms/LoopVectorize/induction_plus.ll b/llvm/test/Transforms/LoopVectorize/induction_plus.ll
index 83490d3..c87dc7b 100644
--- a/llvm/test/Transforms/LoopVectorize/induction_plus.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction_plus.ll
@@ -10,8 +10,7 @@ define void @array_at_plus_one(i32 %n) {
; CHECK: [[VEC_IV_TRUNC:%.+]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ [[VEC_IV_TRUNC_NEXT:%.+]], %vector.body ]
; CHECK: [[T2:%.+]] = add nsw i64 %index, 12
; CHECK-NEXT: [[GEP:%.+]] = getelementptr inbounds [1024 x i32], ptr @array, i64 0, i64 [[T2]]
-; CHECK-NEXT: [[GEP0:%.+]] = getelementptr inbounds i32, ptr [[GEP]], i32 0
-; CHECK-NEXT: store <4 x i32> [[VEC_IV_TRUNC]], ptr [[GEP0]]
+; CHECK-NEXT: store <4 x i32> [[VEC_IV_TRUNC]], ptr [[GEP]]
; CHECK: [[VEC_IV_TRUNC_NEXT]] = add <4 x i32> [[VEC_IV_TRUNC]], splat (i32 4)
; CHECK: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
index b9d3356..3330f2b 100644
--- a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
@@ -11,17 +11,16 @@ define i32 @one_direct_branch(ptr %src) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i32> splat (i32 25500), [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
@@ -68,17 +67,16 @@ define i32 @two_direct_branch(ptr %src) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i32> splat (i32 25500), [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
@@ -135,8 +133,7 @@ define i32 @cond_branch(i32 %a, ptr %src) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i32> splat (i32 25500), [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP3]], <4 x i32> splat (i32 10)
@@ -146,9 +143,9 @@ define i32 @cond_branch(i32 %a, ptr %src) {
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
@@ -211,9 +208,9 @@ define i32 @optimizable_trunc_used_outside() {
; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
index f038700..64caecc 100644
--- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
@@ -284,7 +284,7 @@ define i16 @reduction_with_casts() {
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
; CHECK: middle.block:
; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP3]], [[TMP2]]
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label %scalar.ph
+; CHECK-NEXT: br label %scalar.ph
;
entry:
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-with-i65-induction.ll b/llvm/test/Transforms/LoopVectorize/interleave-with-i65-induction.ll
index fb00762..8a48209 100644
--- a/llvm/test/Transforms/LoopVectorize/interleave-with-i65-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleave-with-i65-induction.ll
@@ -32,12 +32,11 @@ define void @i65_induction_with_negative_step(ptr %dst) {
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i65 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[STEP_ADD]], i32 3
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i65 [ -1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i65 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll
index a89945f..651210d 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll
@@ -21,16 +21,15 @@ define void @gep_for_first_member_does_not_dominate_insert_point(ptr %str, ptr n
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i8> [[WIDE_VEC]], <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i8> [[STRIDED_VEC2]], [[STRIDED_VEC]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <4 x i8> [[TMP5]], ptr [[TMP7]], align 1
+; CHECK-NEXT: store <4 x i8> [[TMP5]], ptr [[TMP6]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ 200, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
index 80ccc38..0afd2e1 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -47,7 +47,7 @@ define void @test_array_load2_store2(i32 %C, i32 %D) {
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
@@ -124,7 +124,7 @@ define void @test_struct_array_load3_store3() {
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
@@ -207,7 +207,7 @@ define i32 @test_struct_load4(ptr nocapture readonly %S) {
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
@@ -280,7 +280,7 @@ define void @test_struct_store4(ptr noalias nocapture readonly %A, ptr noalias n
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
@@ -366,7 +366,7 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
@@ -616,7 +616,7 @@ define void @load_gap_reverse(ptr noalias nocapture %P1, ptr noalias nocapture %
; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
@@ -678,7 +678,7 @@ define void @mixed_load2_store2(ptr noalias nocapture readonly %A, ptr noalias n
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
@@ -750,7 +750,7 @@ define void @mixed_load3_store3(ptr nocapture %A) {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
@@ -835,7 +835,7 @@ define void @int_float_struct(ptr nocapture readonly %A) #0 {
; CHECK: middle.block:
; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
diff --git a/llvm/test/Transforms/LoopVectorize/invalidate-scev-at-scope-after-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invalidate-scev-at-scope-after-vectorization.ll
index 3f56978..752a0a0 100644
--- a/llvm/test/Transforms/LoopVectorize/invalidate-scev-at-scope-after-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/invalidate-scev-at-scope-after-vectorization.ll
@@ -16,18 +16,11 @@ define void @test_invalidate_scevs_at_scope(ptr %p) {
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[P]], align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
-; CHECK-NEXT: br i1 false, label %[[EXIT_1:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_1:.*]]
@@ -37,42 +30,44 @@ define void @test_invalidate_scevs_at_scope(ptr %p) {
; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[TMP4]], [[IV_1]]
; CHECK-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1
; CHECK-NEXT: [[C_1:%.*]] = icmp eq i32 [[IV_1]], 100
-; CHECK-NEXT: br i1 [[C_1]], label %[[EXIT_1]], label %[[LOOP_1]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[C_1]], label %[[EXIT_1:.*]], label %[[LOOP_1]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT_1]]:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD_1]], %[[LOOP_1]] ], [ [[TMP3]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP4]], %[[LOOP_1]] ]
+; CHECK-NEXT: [[ADD_LCSSA1:%.*]] = phi i32 [ [[ADD_1]], %[[LOOP_1]] ]
+; CHECK-NEXT: [[ADD_LCSSA:%.*]] = add i32 [[DOTLCSSA]], 100
; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[ADD_LCSSA]], i32 100)
-; CHECK-NEXT: [[TMP5:%.*]] = sub i32 [[SMAX]], [[ADD_LCSSA]]
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SMAX]], -100
+; CHECK-NEXT: [[TMP5:%.*]] = sub i32 [[TMP3]], [[DOTLCSSA]]
; CHECK-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP6]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP7]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH2:.*]], label %[[VECTOR_PH3:.*]]
-; CHECK: [[VECTOR_PH3]]:
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH1:.*]], label %[[VECTOR_PH2:.*]]
+; CHECK: [[VECTOR_PH2]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP7]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP7]], [[N_MOD_VF]]
-; CHECK-NEXT: br label %[[VECTOR_BODY4:.*]]
-; CHECK: [[VECTOR_BODY4]]:
-; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ 0, %[[VECTOR_PH3]] ], [ [[INDEX_NEXT8:%.*]], %[[VECTOR_BODY4]] ]
-; CHECK-NEXT: [[VEC_IND6:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH3]] ], [ [[VEC_IND_NEXT7:%.*]], %[[VECTOR_BODY4]] ]
+; CHECK-NEXT: br label %[[VECTOR_BODY3:.*]]
+; CHECK: [[VECTOR_BODY3]]:
+; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ 0, %[[VECTOR_PH2]] ], [ [[INDEX_NEXT8:%.*]], %[[VECTOR_BODY3]] ]
+; CHECK-NEXT: [[VEC_IND6:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH2]] ], [ [[VEC_IND_NEXT7:%.*]], %[[VECTOR_BODY3]] ]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[INDEX5]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 0
-; CHECK-NEXT: store <4 x i64> [[VEC_IND6]], ptr [[TMP10]], align 4
+; CHECK-NEXT: store <4 x i64> [[VEC_IND6]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 4
; CHECK-NEXT: [[VEC_IND_NEXT7]] = add <4 x i64> [[VEC_IND6]], splat (i64 4)
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK1:.*]], label %[[VECTOR_BODY4]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: [[MIDDLE_BLOCK1]]:
+; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK6:.*]], label %[[VECTOR_BODY3]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK6]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP7]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT_2:.*]], label %[[SCALAR_PH2]]
-; CHECK: [[SCALAR_PH2]]:
-; CHECK-NEXT: [[BC_RESUME_VAL9:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK1]] ], [ 0, %[[EXIT_1]] ]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT_2:.*]], label %[[SCALAR_PH1]]
+; CHECK: [[SCALAR_PH1]]:
+; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK6]] ], [ 0, %[[EXIT_1]] ]
; CHECK-NEXT: br label %[[LOOP_2:.*]]
; CHECK: [[LOOP_2]]:
-; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[BC_RESUME_VAL9]], %[[SCALAR_PH2]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP_2]] ]
+; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[BC_RESUME_VAL7]], %[[SCALAR_PH1]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP_2]] ]
; CHECK-NEXT: [[IV_2_TRUNC:%.*]] = trunc i64 [[IV_2]] to i32
; CHECK-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], 1
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV_2]]
; CHECK-NEXT: store i64 [[IV_2]], ptr [[GEP]], align 4
-; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD_LCSSA]], [[IV_2_TRUNC]]
+; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD_LCSSA1]], [[IV_2_TRUNC]]
; CHECK-NEXT: [[C_2:%.*]] = icmp slt i32 [[ADD_2]], 100
; CHECK-NEXT: br i1 [[C_2]], label %[[LOOP_2]], label %[[EXIT_2]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[EXIT_2]]:
diff --git a/llvm/test/Transforms/LoopVectorize/is_fpclass.ll b/llvm/test/Transforms/LoopVectorize/is_fpclass.ll
index 9f82c93d..6eeeace 100644
--- a/llvm/test/Transforms/LoopVectorize/is_fpclass.ll
+++ b/llvm/test/Transforms/LoopVectorize/is_fpclass.ll
@@ -15,15 +15,14 @@ define void @d() {
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, ptr @d, i64 [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> [[BROADCAST_SPLAT]], i32 0)
; CHECK-NEXT: [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x float> zeroinitializer, <2 x float> splat (float 1.000000e+00)
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[TMP4]], align 4
+; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[TMP1]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I7:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
index eb9c1cd..a0068f0 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
@@ -30,10 +30,10 @@ define i64 @select_decreasing_induction_icmp_const_start(ptr %a) {
; IC1VF4-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[TMP4]])
; IC1VF4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP6]], 9223372036854775807
; IC1VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP6]], i64 331
-; IC1VF4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC1VF4-NEXT: br label %[[EXIT:.*]]
; IC1VF4: [[SCALAR_PH]]:
-; IC1VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, %[[MIDDLE_BLOCK]] ], [ 19999, %[[ENTRY]] ]
-; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ]
+; IC1VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 19999, %[[ENTRY]] ]
+; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 331, %[[ENTRY]] ]
; IC1VF4-NEXT: br label %[[LOOP:.*]]
; IC1VF4: [[LOOP]]:
; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -102,10 +102,10 @@ define i64 @select_decreasing_induction_icmp_const_start(ptr %a) {
; IC4VF4-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[RDX_MINMAX11]])
; IC4VF4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP18]], 9223372036854775807
; IC4VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP18]], i64 331
-; IC4VF4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC4VF4-NEXT: br label %[[EXIT:.*]]
; IC4VF4: [[SCALAR_PH]]:
-; IC4VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, %[[MIDDLE_BLOCK]] ], [ 19999, %[[ENTRY]] ]
-; IC4VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ]
+; IC4VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 19999, %[[ENTRY]] ]
+; IC4VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 331, %[[ENTRY]] ]
; IC4VF4-NEXT: br label %[[LOOP:.*]]
; IC4VF4: [[LOOP]]:
; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -162,10 +162,10 @@ define i64 @select_decreasing_induction_icmp_const_start(ptr %a) {
; IC4VF1-NEXT: [[RDX_MINMAX5:%.*]] = call i64 @llvm.smin.i64(i64 [[RDX_MINMAX4]], i64 [[TMP18]])
; IC4VF1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[RDX_MINMAX5]], 9223372036854775807
; IC4VF1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX5]], i64 331
-; IC4VF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC4VF1-NEXT: br label %[[EXIT:.*]]
; IC4VF1: [[SCALAR_PH]]:
-; IC4VF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, %[[MIDDLE_BLOCK]] ], [ 19999, %[[ENTRY]] ]
-; IC4VF1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ]
+; IC4VF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 19999, %[[ENTRY]] ]
+; IC4VF1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 331, %[[ENTRY]] ]
; IC4VF1-NEXT: br label %[[LOOP:.*]]
; IC4VF1: [[LOOP]]:
; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -232,10 +232,10 @@ define i16 @select_decreasing_induction_icmp_table_i16(i16 noundef %val) {
; IC1VF4-NEXT: [[TMP7:%.*]] = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> [[TMP5]])
; IC1VF4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i16 [[TMP7]], 32767
; IC1VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[TMP7]], i16 0
-; IC1VF4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC1VF4-NEXT: br label %[[EXIT:.*]]
; IC1VF4: [[SCALAR_PH]]:
-; IC1VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, %[[MIDDLE_BLOCK]] ], [ 12, %[[ENTRY]] ]
-; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; IC1VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 12, %[[ENTRY]] ]
+; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, %[[ENTRY]] ]
; IC1VF4-NEXT: br label %[[LOOP:.*]]
; IC1VF4: [[LOOP]]:
; IC1VF4-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -532,10 +532,10 @@ define i16 @select_decreasing_induction_icmp_table_i16(i16 noundef %val) {
; IC4VF1-NEXT: [[RDX_MINMAX5:%.*]] = call i16 @llvm.smin.i16(i16 [[RDX_MINMAX4]], i16 [[TMP22]])
; IC4VF1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i16 [[RDX_MINMAX5]], 32767
; IC4VF1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[RDX_MINMAX5]], i16 0
-; IC4VF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC4VF1-NEXT: br label %[[EXIT:.*]]
; IC4VF1: [[SCALAR_PH]]:
-; IC4VF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, %[[MIDDLE_BLOCK]] ], [ 12, %[[ENTRY]] ]
-; IC4VF1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; IC4VF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 12, %[[ENTRY]] ]
+; IC4VF1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, %[[ENTRY]] ]
; IC4VF1-NEXT: br label %[[LOOP:.*]]
; IC4VF1: [[LOOP]]:
; IC4VF1-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -603,10 +603,10 @@ define i16 @select_decreasing_induction_icmp_table_half(half noundef %val) {
; IC1VF4-NEXT: [[TMP7:%.*]] = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> [[TMP5]])
; IC1VF4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i16 [[TMP7]], 32767
; IC1VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[TMP7]], i16 0
-; IC1VF4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC1VF4-NEXT: br label %[[EXIT:.*]]
; IC1VF4: [[SCALAR_PH]]:
-; IC1VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, %[[MIDDLE_BLOCK]] ], [ 12, %[[ENTRY]] ]
-; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; IC1VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 12, %[[ENTRY]] ]
+; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, %[[ENTRY]] ]
; IC1VF4-NEXT: br label %[[LOOP:.*]]
; IC1VF4: [[LOOP]]:
; IC1VF4-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -903,10 +903,10 @@ define i16 @select_decreasing_induction_icmp_table_half(half noundef %val) {
; IC4VF1-NEXT: [[RDX_MINMAX5:%.*]] = call i16 @llvm.smin.i16(i16 [[RDX_MINMAX4]], i16 [[TMP22]])
; IC4VF1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i16 [[RDX_MINMAX5]], 32767
; IC4VF1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[RDX_MINMAX5]], i16 0
-; IC4VF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC4VF1-NEXT: br label %[[EXIT:.*]]
; IC4VF1: [[SCALAR_PH]]:
-; IC4VF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, %[[MIDDLE_BLOCK]] ], [ 12, %[[ENTRY]] ]
-; IC4VF1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; IC4VF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 12, %[[ENTRY]] ]
+; IC4VF1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, %[[ENTRY]] ]
; IC4VF1-NEXT: br label %[[LOOP:.*]]
; IC4VF1: [[LOOP]]:
; IC4VF1-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -971,10 +971,10 @@ define i64 @select_decreasing_induction_icmp_iv_unsigned(ptr %a) {
; IC1VF4-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> [[TMP4]])
; IC1VF4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP6]], -1
; IC1VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP6]], i64 331
-; IC1VF4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC1VF4-NEXT: br label %[[EXIT:.*]]
; IC1VF4: [[SCALAR_PH]]:
-; IC1VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, %[[MIDDLE_BLOCK]] ], [ 9223372036854775807, %[[ENTRY]] ]
-; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ]
+; IC1VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9223372036854775807, %[[ENTRY]] ]
+; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 331, %[[ENTRY]] ]
; IC1VF4-NEXT: br label %[[LOOP:.*]]
; IC1VF4: [[LOOP]]:
; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -1043,10 +1043,10 @@ define i64 @select_decreasing_induction_icmp_iv_unsigned(ptr %a) {
; IC4VF4-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> [[RDX_MINMAX11]])
; IC4VF4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP18]], -1
; IC4VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP18]], i64 331
-; IC4VF4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC4VF4-NEXT: br label %[[EXIT:.*]]
; IC4VF4: [[SCALAR_PH]]:
-; IC4VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, %[[MIDDLE_BLOCK]] ], [ 9223372036854775807, %[[ENTRY]] ]
-; IC4VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ]
+; IC4VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9223372036854775807, %[[ENTRY]] ]
+; IC4VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 331, %[[ENTRY]] ]
; IC4VF4-NEXT: br label %[[LOOP:.*]]
; IC4VF4: [[LOOP]]:
; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -1103,10 +1103,10 @@ define i64 @select_decreasing_induction_icmp_iv_unsigned(ptr %a) {
; IC4VF1-NEXT: [[RDX_MINMAX5:%.*]] = call i64 @llvm.umin.i64(i64 [[RDX_MINMAX4]], i64 [[TMP18]])
; IC4VF1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[RDX_MINMAX5]], -1
; IC4VF1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX5]], i64 331
-; IC4VF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC4VF1-NEXT: br label %[[EXIT:.*]]
; IC4VF1: [[SCALAR_PH]]:
-; IC4VF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, %[[MIDDLE_BLOCK]] ], [ 9223372036854775807, %[[ENTRY]] ]
-; IC4VF1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ]
+; IC4VF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9223372036854775807, %[[ENTRY]] ]
+; IC4VF1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 331, %[[ENTRY]] ]
; IC4VF1-NEXT: br label %[[LOOP:.*]]
; IC4VF1: [[LOOP]]:
; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-nested-loop.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-nested-loop.ll
index c341605..24c5602 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-nested-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-nested-loop.ll
@@ -25,8 +25,7 @@ define i64 @select_iv_def_from_outer_loop(ptr %a, i64 %start, i64 %n) {
; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 [[INDEX]]
-; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 3)
; CHECK-VF4IC1-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -83,11 +82,10 @@ define i64 @select_iv_def_from_outer_loop(ptr %a, i64 %start, i64 %n) {
; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 [[TMP1]]
-; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 8
; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 12
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP6]], align 8
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll
index eefb327..c958ea7 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll
@@ -16,11 +16,9 @@ define i64 @select_icmp_nuw_nsw(ptr %a, ptr %b, i64 %ii, i64 %n) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP9]], 4
@@ -88,11 +86,9 @@ define i64 @select_icmp_nsw(ptr %a, ptr %b, i64 %ii, i64 %n) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP9]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll
index 6a2e3df..1054482 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll
@@ -26,8 +26,7 @@ define i32 @select_icmp_const_truncated_iv_widened_exit(ptr %a, i32 %n) {
; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], splat (i64 3)
; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -86,11 +85,10 @@ define i32 @select_icmp_const_truncated_iv_widened_exit(ptr %a, i32 %n) {
; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4)
; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4
; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 8
; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 12
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
@@ -251,8 +249,7 @@ define i32 @select_icmp_const_truncated_iv_const_exit(ptr %a) {
; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], splat (i64 3)
; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -263,10 +260,10 @@ define i32 @select_icmp_const_truncated_iv_const_exit(ptr %a) {
; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP6]], -2147483648
; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP6]], i32 331
-; CHECK-VF4IC1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1-NEXT: br label %[[EXIT:.*]]
; CHECK-VF4IC1: [[SCALAR_PH]]:
-; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ]
; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC1: [[FOR_BODY]]:
; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
@@ -300,11 +297,10 @@ define i32 @select_icmp_const_truncated_iv_const_exit(ptr %a) {
; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4)
; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4
; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 8
; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 12
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
@@ -327,10 +323,10 @@ define i32 @select_icmp_const_truncated_iv_const_exit(ptr %a) {
; CHECK-VF4IC4-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[RDX_MINMAX8]])
; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP15]], -2147483648
; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP15]], i32 331
-; CHECK-VF4IC4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4-NEXT: br label %[[EXIT:.*]]
; CHECK-VF4IC4: [[SCALAR_PH]]:
-; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ]
; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC4: [[FOR_BODY]]:
; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
@@ -391,10 +387,10 @@ define i32 @select_icmp_const_truncated_iv_const_exit(ptr %a) {
; CHECK-VF1IC4-NEXT: [[RDX_MINMAX5:%.*]] = call i32 @llvm.smax.i32(i32 [[RDX_MINMAX4]], i32 [[TMP24]])
; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[RDX_MINMAX5]], -2147483648
; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[RDX_MINMAX5]], i32 331
-; CHECK-VF1IC4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF1IC4-NEXT: br label %[[EXIT:.*]]
; CHECK-VF1IC4: [[SCALAR_PH]]:
-; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ]
; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF1IC4: [[FOR_BODY]]:
; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
@@ -444,8 +440,7 @@ define i32 @select_fcmp_max_valid_const_ub(ptr %a) {
; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], zeroinitializer
; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -456,10 +451,10 @@ define i32 @select_fcmp_max_valid_const_ub(ptr %a) {
; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP6]], -2147483648
; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP6]], i32 -1
-; CHECK-VF4IC1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1-NEXT: br label %[[EXIT:.*]]
; CHECK-VF4IC1: [[SCALAR_PH]]:
-; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2147483648, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ]
; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC1: [[FOR_BODY]]:
; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
@@ -493,11 +488,10 @@ define i32 @select_fcmp_max_valid_const_ub(ptr %a) {
; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4)
; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 4
; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 8
; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 12
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
@@ -520,10 +514,10 @@ define i32 @select_fcmp_max_valid_const_ub(ptr %a) {
; CHECK-VF4IC4-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[RDX_MINMAX8]])
; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP15]], -2147483648
; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP15]], i32 -1
-; CHECK-VF4IC4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4-NEXT: br label %[[EXIT:.*]]
; CHECK-VF4IC4: [[SCALAR_PH]]:
-; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2147483648, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ]
; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC4: [[FOR_BODY]]:
; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
@@ -584,10 +578,10 @@ define i32 @select_fcmp_max_valid_const_ub(ptr %a) {
; CHECK-VF1IC4-NEXT: [[RDX_MINMAX5:%.*]] = call i32 @llvm.smax.i32(i32 [[RDX_MINMAX4]], i32 [[TMP24]])
; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[RDX_MINMAX5]], -2147483648
; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[RDX_MINMAX5]], i32 -1
-; CHECK-VF1IC4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF1IC4-NEXT: br label %[[EXIT:.*]]
; CHECK-VF1IC4: [[SCALAR_PH]]:
-; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2147483648, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ]
; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF1IC4: [[FOR_BODY]]:
; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
@@ -642,8 +636,7 @@ define i32 @select_icmp_truncated_unsigned_iv_range(ptr %a) {
; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 2147483646, i32 2147483647, i32 -2147483648, i32 -2147483647>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[IV:%.*]] = add i64 2147483646, [[INDEX]]
; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i32 0
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4
; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3)
; CHECK-VF4IC1-NEXT: [[TMP3]] = select <4 x i1> [[TMP2]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -654,10 +647,10 @@ define i32 @select_icmp_truncated_unsigned_iv_range(ptr %a) {
; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP3]])
; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP5]], 0
; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP5]], i32 331
-; CHECK-VF4IC1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1-NEXT: br label %[[EXIT:.*]]
; CHECK-VF4IC1: [[SCALAR_PH]]:
-; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4294967294, %[[MIDDLE_BLOCK]] ], [ 2147483646, %[[ENTRY]] ]
-; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2147483646, %[[ENTRY]] ]
+; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ]
; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC1: [[FOR_BODY]]:
; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
@@ -692,11 +685,10 @@ define i32 @select_icmp_truncated_unsigned_iv_range(ptr %a) {
; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4)
; CHECK-VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 2147483646, [[INDEX]]
; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OFFSET_IDX]]
-; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 4
; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 8
; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 12
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
@@ -719,10 +711,10 @@ define i32 @select_icmp_truncated_unsigned_iv_range(ptr %a) {
; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[RDX_MINMAX8]])
; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP14]], 0
; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP14]], i32 331
-; CHECK-VF4IC4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4-NEXT: br label %[[EXIT:.*]]
; CHECK-VF4IC4: [[SCALAR_PH]]:
-; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4294967294, %[[MIDDLE_BLOCK]] ], [ 2147483646, %[[ENTRY]] ]
-; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2147483646, %[[ENTRY]] ]
+; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ]
; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF4IC4: [[FOR_BODY]]:
; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
@@ -785,10 +777,10 @@ define i32 @select_icmp_truncated_unsigned_iv_range(ptr %a) {
; CHECK-VF1IC4-NEXT: [[RDX_MINMAX6:%.*]] = call i32 @llvm.umax.i32(i32 [[RDX_MINMAX5]], i32 [[TMP22]])
; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[RDX_MINMAX6]], 0
; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[RDX_MINMAX6]], i32 331
-; CHECK-VF1IC4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF1IC4-NEXT: br label %[[EXIT:.*]]
; CHECK-VF1IC4: [[SCALAR_PH]]:
-; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4294967294, %[[MIDDLE_BLOCK]] ], [ 2147483646, %[[ENTRY]] ]
-; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2147483646, %[[ENTRY]] ]
+; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ]
; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]]
; CHECK-VF1IC4: [[FOR_BODY]]:
; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
index eab5d5e..fcaff55 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll
@@ -18,8 +18,7 @@ define i64 @select_icmp_const_1(ptr %a, i64 %n) {
; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 3)
; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -70,11 +69,10 @@ define i64 @select_icmp_const_1(ptr %a, i64 %n) {
; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4
; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 8
; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 12
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
@@ -213,8 +211,7 @@ define i64 @select_icmp_const_2(ptr %a, i64 %n) {
; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 3)
; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_PHI]], <4 x i64> [[VEC_IND]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -265,11 +262,10 @@ define i64 @select_icmp_const_2(ptr %a, i64 %n) {
; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4
; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 8
; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 12
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
@@ -408,8 +404,7 @@ define i64 @select_icmp_const_3_variable_rdx_start(ptr %a, i64 %rdx.start, i64 %
; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 3)
; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -460,11 +455,10 @@ define i64 @select_icmp_const_3_variable_rdx_start(ptr %a, i64 %rdx.start, i64 %
; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4
; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 8
; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 12
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
@@ -603,8 +597,7 @@ define i64 @select_fcmp_const_fast(ptr %a, i64 %n) {
; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00)
; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -655,11 +648,10 @@ define i64 @select_fcmp_const_fast(ptr %a, i64 %n) {
; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 4
; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 8
; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 12
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
@@ -798,8 +790,7 @@ define i64 @select_fcmp_const(ptr %a, i64 %n) {
; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = fcmp ueq <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00)
; CHECK-VF4IC1-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -850,11 +841,10 @@ define i64 @select_fcmp_const(ptr %a, i64 %n) {
; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 4
; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 8
; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 12
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
@@ -993,11 +983,9 @@ define i64 @select_icmp(ptr %a, ptr %b, i64 %rdx.start, i64 %n) {
; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; CHECK-VF4IC1-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -1050,20 +1038,18 @@ define i64 @select_icmp(ptr %a, ptr %b, i64 %rdx.start, i64 %n) {
; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4
; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 8
; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 12
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 4
; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 8
; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 12
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i64>, ptr [[TMP6]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
@@ -1216,11 +1202,9 @@ define i64 @select_fcmp(ptr %a, ptr %b, i64 %rdx.start, i64 %n) {
; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
-; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; CHECK-VF4IC1-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -1273,20 +1257,18 @@ define i64 @select_fcmp(ptr %a, ptr %b, i64 %rdx.start, i64 %n) {
; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 4
; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 8
; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 12
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
-; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 4
; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 8
; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 12
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP8]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP9]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP10]], align 4
@@ -1440,11 +1422,9 @@ define i64 @select_icmp_min_valid_iv_start(ptr %a, ptr %b, i64 %rdx.start, i64 %
; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 -9223372036854775807, i64 -9223372036854775806, i64 -9223372036854775805, i64 -9223372036854775804>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -9223372036854775808), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-VF4IC1-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -1501,20 +1481,18 @@ define i64 @select_icmp_min_valid_iv_start(ptr %a, ptr %b, i64 %rdx.start, i64 %
; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 4
; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 8
; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 12
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 4
; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 8
; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 12
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP6]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
@@ -1677,11 +1655,9 @@ define i64 @select_icmp_unsigned_iv_range(ptr %a, ptr %b, i64 %rdx.start) {
; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775807, i64 -9223372036854775806, i64 -9223372036854775805>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 0
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8
; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; CHECK-VF4IC1-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -1692,7 +1668,7 @@ define i64 @select_icmp_unsigned_iv_range(ptr %a, ptr %b, i64 %rdx.start) {
; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[TMP5]])
; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP7]], 0
; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP7]], i64 [[RDX_START]]
-; CHECK-VF4IC1-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC1-NEXT: br label %[[SCALAR_PH]]
; CHECK-VF4IC1: [[SCALAR_PH]]:
; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -4, %[[MIDDLE_BLOCK]] ], [ -9223372036854775808, %[[ENTRY]] ]
; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ 9223372036854775804, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
@@ -1711,9 +1687,9 @@ define i64 @select_icmp_unsigned_iv_range(ptr %a, ptr %b, i64 %rdx.start) {
; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1
; CHECK-VF4IC1-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1
; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 9223372036854775806
-; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK-VF4IC1: [[EXIT]]:
-; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]]
;
; CHECK-VF4IC4-LABEL: define i64 @select_icmp_unsigned_iv_range(
@@ -1733,20 +1709,18 @@ define i64 @select_icmp_unsigned_iv_range(ptr %a, ptr %b, i64 %rdx.start) {
; CHECK-VF4IC4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4)
; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 0
; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 4
; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 8
; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 12
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 4
; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 8
; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 12
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i64>, ptr [[TMP6]], align 8
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
@@ -1769,7 +1743,7 @@ define i64 @select_icmp_unsigned_iv_range(ptr %a, ptr %b, i64 %rdx.start) {
; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[RDX_MINMAX12]])
; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP19]], 0
; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP19]], i64 [[RDX_START]]
-; CHECK-VF4IC4-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4IC4-NEXT: br label %[[SCALAR_PH]]
; CHECK-VF4IC4: [[SCALAR_PH]]:
; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -16, %[[MIDDLE_BLOCK]] ], [ -9223372036854775808, %[[ENTRY]] ]
; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i64 [ 9223372036854775792, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
@@ -1788,9 +1762,9 @@ define i64 @select_icmp_unsigned_iv_range(ptr %a, ptr %b, i64 %rdx.start) {
; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1
; CHECK-VF4IC4-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1
; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 9223372036854775806
-; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK-VF4IC4: [[EXIT]]:
-; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]]
;
; CHECK-VF1IC4-LABEL: define i64 @select_icmp_unsigned_iv_range(
@@ -1845,7 +1819,7 @@ define i64 @select_icmp_unsigned_iv_range(ptr %a, ptr %b, i64 %rdx.start) {
; CHECK-VF1IC4-NEXT: [[RDX_MINMAX5:%.*]] = call i64 @llvm.umax.i64(i64 [[RDX_MINMAX4]], i64 [[TMP29]])
; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[RDX_MINMAX5]], 0
; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX5]], i64 [[RDX_START]]
-; CHECK-VF1IC4-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF1IC4-NEXT: br label %[[SCALAR_PH]]
; CHECK-VF1IC4: [[SCALAR_PH]]:
; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -4, %[[MIDDLE_BLOCK]] ], [ -9223372036854775808, %[[ENTRY]] ]
; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ 9223372036854775804, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
@@ -1864,9 +1838,9 @@ define i64 @select_icmp_unsigned_iv_range(ptr %a, ptr %b, i64 %rdx.start) {
; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV_I1]], 1
; CHECK-VF1IC4-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1
; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 9223372036854775806
-; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK-VF1IC4: [[EXIT]]:
-; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ]
; CHECK-VF1IC4-NEXT: ret i64 [[COND_LCSSA]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
index 7b77354..97d3385 100644
--- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
@@ -101,9 +101,9 @@ define i32 @constpre() {
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[FOR_END:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[MIDDLE_BLOCK]] ], [ 32, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
@@ -133,19 +133,19 @@ define ptr @geppre(ptr %ptr) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR]], i64 512
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 512
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
-; CHECK-NEXT: br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
+; CHECK-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[IND_END]], i64 -16
-; CHECK-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[TMP0]], i64 -16
+; CHECK-NEXT: br label %[[FOR_END:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[PTR]], %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[PTR]], %[[ENTRY]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ]
@@ -407,16 +407,15 @@ define i64 @iv_scalar_steps_and_outside_users(ptr %ptr) {
; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VEC-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[INDEX]]
-; VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; VEC-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP2]], align 4
+; VEC-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP1]], align 4
; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; VEC-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; VEC-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1002
; VEC-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
; VEC: [[MIDDLE_BLOCK]]:
-; VEC-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VEC-NEXT: br label %[[EXIT:.*]]
; VEC: [[SCALAR_PH]]:
-; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1002, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; VEC-NEXT: br label %[[LOOP:.*]]
; VEC: [[LOOP]]:
; VEC-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -446,9 +445,9 @@ define i64 @iv_scalar_steps_and_outside_users(ptr %ptr) {
; INTERLEAVE-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1002
; INTERLEAVE-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
; INTERLEAVE: [[MIDDLE_BLOCK]]:
-; INTERLEAVE-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE-NEXT: br label %[[EXIT:.*]]
; INTERLEAVE: [[SCALAR_PH]]:
-; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1002, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; INTERLEAVE-NEXT: br label %[[LOOP:.*]]
; INTERLEAVE: [[LOOP]]:
; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -490,17 +489,16 @@ define i32 @iv_2_dead_in_loop_only_used_outside(ptr %ptr) {
; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VEC-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[INDEX]]
-; VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; VEC-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP2]], align 4
+; VEC-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP1]], align 4
; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; VEC-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; VEC-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1002
; VEC-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
; VEC: [[MIDDLE_BLOCK]]:
-; VEC-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VEC-NEXT: br label %[[EXIT:.*]]
; VEC: [[SCALAR_PH]]:
-; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1002, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; VEC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 2004, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; VEC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
; VEC-NEXT: br label %[[LOOP:.*]]
; VEC: [[LOOP]]:
; VEC-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -532,10 +530,10 @@ define i32 @iv_2_dead_in_loop_only_used_outside(ptr %ptr) {
; INTERLEAVE-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1002
; INTERLEAVE-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
; INTERLEAVE: [[MIDDLE_BLOCK]]:
-; INTERLEAVE-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE-NEXT: br label %[[EXIT:.*]]
; INTERLEAVE: [[SCALAR_PH]]:
-; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1002, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 2004, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
; INTERLEAVE-NEXT: br label %[[LOOP:.*]]
; INTERLEAVE: [[LOOP]]:
; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -762,8 +760,7 @@ define float @fp_postinc_use_fadd(float %init, ptr noalias nocapture %A, i64 %N,
; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VEC-NEXT: [[VEC_IND:%.*]] = phi <2 x float> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VEC-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; VEC-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
-; VEC-NEXT: store <2 x float> [[VEC_IND]], ptr [[TMP6]], align 4
+; VEC-NEXT: store <2 x float> [[VEC_IND]], ptr [[TMP5]], align 4
; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; VEC-NEXT: [[VEC_IND_NEXT]] = fadd fast <2 x float> [[VEC_IND]], [[DOTSPLAT4]]
; VEC-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -878,8 +875,7 @@ define float @fp_postinc_use_fadd_ops_swapped(float %init, ptr noalias nocapture
; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VEC-NEXT: [[VEC_IND:%.*]] = phi <2 x float> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VEC-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; VEC-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
-; VEC-NEXT: store <2 x float> [[VEC_IND]], ptr [[TMP6]], align 4
+; VEC-NEXT: store <2 x float> [[VEC_IND]], ptr [[TMP5]], align 4
; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; VEC-NEXT: [[VEC_IND_NEXT]] = fadd fast <2 x float> [[VEC_IND]], [[DOTSPLAT4]]
; VEC-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -994,8 +990,7 @@ define float @fp_postinc_use_fsub(float %init, ptr noalias nocapture %A, i64 %N,
; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VEC-NEXT: [[VEC_IND:%.*]] = phi <2 x float> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VEC-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; VEC-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
-; VEC-NEXT: store <2 x float> [[VEC_IND]], ptr [[TMP6]], align 4
+; VEC-NEXT: store <2 x float> [[VEC_IND]], ptr [[TMP5]], align 4
; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; VEC-NEXT: [[VEC_IND_NEXT]] = fsub fast <2 x float> [[VEC_IND]], [[DOTSPLAT4]]
; VEC-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1099,16 +1094,15 @@ define i32 @test_iv_uniform_with_outside_use_scev_simplification(ptr %dst) {
; VEC-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
; VEC-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 1
; VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP0]]
-; VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; VEC-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP2]], align 2
+; VEC-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP1]], align 2
; VEC-NEXT: [[TMP5:%.*]] = add i32 [[STEP_2]], [[TMP6]]
; VEC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; VEC-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8
; VEC-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
; VEC: [[MIDDLE_BLOCK]]:
-; VEC-NEXT: br i1 true, label %[[E_EXIT:.*]], label %[[SCALAR_PH]]
+; VEC-NEXT: br label %[[E_EXIT:.*]]
; VEC: [[SCALAR_PH]]:
-; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
; VEC-NEXT: br label %[[LOOP:.*]]
; VEC: [[LOOP]]:
; VEC-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -1141,9 +1135,9 @@ define i32 @test_iv_uniform_with_outside_use_scev_simplification(ptr %dst) {
; INTERLEAVE-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8
; INTERLEAVE-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
; INTERLEAVE: [[MIDDLE_BLOCK]]:
-; INTERLEAVE-NEXT: br i1 true, label %[[E_EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE-NEXT: br label %[[E_EXIT:.*]]
; INTERLEAVE: [[SCALAR_PH]]:
-; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
; INTERLEAVE-NEXT: br label %[[LOOP:.*]]
; INTERLEAVE: [[LOOP]]:
; INTERLEAVE-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -1203,9 +1197,9 @@ define i32 @test_iv_uniform_with_outside_use_scev_simplification_2(ptr %dst) {
; VEC-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
; VEC: [[MIDDLE_BLOCK]]:
; VEC-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
-; VEC-NEXT: br i1 true, label %[[E_EXIT:.*]], label %[[SCALAR_PH]]
+; VEC-NEXT: br label %[[E_EXIT:.*]]
; VEC: [[SCALAR_PH]]:
-; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
; VEC-NEXT: br label %[[LOOP:.*]]
; VEC: [[LOOP]]:
; VEC-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -1241,9 +1235,9 @@ define i32 @test_iv_uniform_with_outside_use_scev_simplification_2(ptr %dst) {
; INTERLEAVE-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
; INTERLEAVE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
; INTERLEAVE: [[MIDDLE_BLOCK]]:
-; INTERLEAVE-NEXT: br i1 true, label %[[E_EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE-NEXT: br label %[[E_EXIT:.*]]
; INTERLEAVE: [[SCALAR_PH]]:
-; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
; INTERLEAVE-NEXT: br label %[[LOOP:.*]]
; INTERLEAVE: [[LOOP]]:
; INTERLEAVE-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -1286,20 +1280,14 @@ define i32 @iv_ext_used_outside( ptr %dst) {
; VEC-NEXT: br label %[[VECTOR_BODY:.*]]
; VEC: [[VECTOR_BODY]]:
; VEC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VEC-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VEC-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
; VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i16 [[OFFSET_IDX]]
-; VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0
-; VEC-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP2]], align 4
-; VEC-NEXT: [[TMP5:%.*]] = add nuw nsw <2 x i16> [[VEC_IND]], splat (i16 1)
-; VEC-NEXT: [[TMP8:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1
-; VEC-NEXT: [[TMP7:%.*]] = zext nneg i16 [[TMP8]] to i32
+; VEC-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP1]], align 4
; VEC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; VEC-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
; VEC-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
; VEC-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
; VEC: [[MIDDLE_BLOCK]]:
-; VEC-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VEC-NEXT: br label %[[SCALAR_PH]]
; VEC: [[SCALAR_PH]]:
; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; VEC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
@@ -1312,9 +1300,9 @@ define i32 @iv_ext_used_outside( ptr %dst) {
; VEC-NEXT: [[IV_1_NEXT]] = add nuw nsw i16 [[IV_1]], 1
; VEC-NEXT: [[IV_1_EXT]] = zext nneg i16 [[IV_1_NEXT]] to i32
; VEC-NEXT: [[EC:%.*]] = icmp samesign ult i16 [[IV_1]], 128
-; VEC-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], {{!llvm.loop ![0-9]+}}
+; VEC-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], {{!llvm.loop ![0-9]+}}
; VEC: [[EXIT]]:
-; VEC-NEXT: [[IV_1_EXT_LCSSA:%.*]] = phi i32 [ [[IV_1_EXT]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
+; VEC-NEXT: [[IV_1_EXT_LCSSA:%.*]] = phi i32 [ [[IV_1_EXT]], %[[LOOP]] ]
; VEC-NEXT: ret i32 [[IV_1_EXT_LCSSA]]
;
; INTERLEAVE-LABEL: define i32 @iv_ext_used_outside(
@@ -1331,13 +1319,11 @@ define i32 @iv_ext_used_outside( ptr %dst) {
; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i16 [[TMP1]]
; INTERLEAVE-NEXT: store i32 0, ptr [[TMP2]], align 4
; INTERLEAVE-NEXT: store i32 0, ptr [[TMP3]], align 4
-; INTERLEAVE-NEXT: [[TMP4:%.*]] = add nuw nsw i16 [[TMP1]], 1
-; INTERLEAVE-NEXT: [[TMP5:%.*]] = zext nneg i16 [[TMP4]] to i32
; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; INTERLEAVE-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
; INTERLEAVE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
; INTERLEAVE: [[MIDDLE_BLOCK]]:
-; INTERLEAVE-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE-NEXT: br label %[[SCALAR_PH]]
; INTERLEAVE: [[SCALAR_PH]]:
; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
@@ -1350,9 +1336,9 @@ define i32 @iv_ext_used_outside( ptr %dst) {
; INTERLEAVE-NEXT: [[IV_1_NEXT]] = add nuw nsw i16 [[IV_1]], 1
; INTERLEAVE-NEXT: [[IV_1_EXT]] = zext nneg i16 [[IV_1_NEXT]] to i32
; INTERLEAVE-NEXT: [[EC:%.*]] = icmp samesign ult i16 [[IV_1]], 128
-; INTERLEAVE-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], {{!llvm.loop ![0-9]+}}
+; INTERLEAVE-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], {{!llvm.loop ![0-9]+}}
; INTERLEAVE: [[EXIT]]:
-; INTERLEAVE-NEXT: [[IV_1_EXT_LCSSA:%.*]] = phi i32 [ [[IV_1_EXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
+; INTERLEAVE-NEXT: [[IV_1_EXT_LCSSA:%.*]] = phi i32 [ [[IV_1_EXT]], %[[LOOP]] ]
; INTERLEAVE-NEXT: ret i32 [[IV_1_EXT_LCSSA]]
;
entry:
@@ -1390,10 +1376,10 @@ define i64 @test_iv_increment_incremented(ptr %dst) {
; VEC-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 1
; VEC-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; VEC: [[MIDDLE_BLOCK]]:
-; VEC-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VEC-NEXT: br label %[[EXIT:.*]]
; VEC: [[SCALAR_PH]]:
-; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1, %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ]
-; VEC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 0, %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, %[[ENTRY]] ]
+; VEC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 2, %[[ENTRY]] ]
; VEC-NEXT: br label %[[LOOP:.*]]
; VEC: [[LOOP]]:
; VEC-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ]
@@ -1423,10 +1409,10 @@ define i64 @test_iv_increment_incremented(ptr %dst) {
; INTERLEAVE-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], 1
; INTERLEAVE-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; INTERLEAVE: [[MIDDLE_BLOCK]]:
-; INTERLEAVE-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE-NEXT: br label %[[EXIT:.*]]
; INTERLEAVE: [[SCALAR_PH]]:
-; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1, %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ]
-; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 0, %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, %[[ENTRY]] ]
+; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 2, %[[ENTRY]] ]
; INTERLEAVE-NEXT: br label %[[LOOP:.*]]
; INTERLEAVE: [[LOOP]]:
; INTERLEAVE-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/lcssa-crashes.ll b/llvm/test/Transforms/LoopVectorize/lcssa-crashes.ll
index 8d88c62..2084833 100644
--- a/llvm/test/Transforms/LoopVectorize/lcssa-crashes.ll
+++ b/llvm/test/Transforms/LoopVectorize/lcssa-crashes.ll
@@ -128,7 +128,7 @@ define i32 @pr57508(ptr %src) {
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2000
; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 2000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
@@ -139,7 +139,7 @@ define i32 @pr57508(ptr %src) {
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[LOCAL_NEXT]] = add i32 [[LOCAL]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 2000
-; CHECK-NEXT: br i1 [[EC]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label [[LOOP_EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: loop.exit:
; CHECK-NEXT: unreachable
; CHECK: bb:
diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
index 8a326c9..2c7d1bd 100644
--- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
@@ -19,8 +19,7 @@ define i16 @test_access_size_not_multiple_of_align(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[PRED_LOAD_CONTINUE2]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i8> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
@@ -49,10 +48,10 @@ define i16 @test_access_size_not_multiple_of_align(i64 %len, ptr %test_base) {
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP17:%.*]] = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> [[TMP15]])
-; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -116,8 +115,7 @@ define i32 @test_access_size_multiple_of_align_but_offset_by_1(i64 %len, ptr %te
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[PRED_LOAD_CONTINUE2]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i8> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
@@ -146,10 +144,10 @@ define i32 @test_access_size_multiple_of_align_but_offset_by_1(i64 %len, ptr %te
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP15]])
-; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -230,12 +228,10 @@ define i32 @loop_requires_scev_predicate(ptr %dest, i32 %end) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP10]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP9]], align 4
; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <2 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x i32>, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x i32>, ptr [[TMP12]], align 4
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0
; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; CHECK: pred.store.if:
@@ -378,9 +374,9 @@ define void @test_rev_loops_deref_loads(ptr nocapture noundef writeonly %dest) {
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1023, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
@@ -490,9 +486,9 @@ define void @test_rev_loops_non_deref_loads(ptr nocapture noundef writeonly %des
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1023, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
@@ -565,8 +561,7 @@ define i16 @test_strided_access(i64 %len, ptr %test_base) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i8> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
@@ -585,10 +580,10 @@ define i16 @test_strided_access(i64 %len, ptr %test_base) {
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP15:%.*]] = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> [[TMP13]])
-; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
@@ -694,9 +689,9 @@ define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 511, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 511, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll
index c6103f5..34c04de 100644
--- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll
@@ -52,10 +52,10 @@ define i8 @test_negative_off(i16 %len, ptr %test_base) {
; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP20:%.*]] = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> [[TMP18]])
-; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ -988, [[MIDDLE_BLOCK]] ], [ -1000, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ -1000, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll
index 3373c6d..70e730f 100644
--- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll
@@ -67,7 +67,7 @@ define void @ptr_depends_on_sdiv(ptr noalias %dst, i16 noundef %off) {
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br [[EXIT:label %.*]]
; CHECK: [[SCALAR_PH]]:
;
entry:
@@ -143,7 +143,7 @@ define void @ptr_depends_on_possibly_poison_value(ptr noalias %dst, i16 %off) {
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br [[EXIT:label %.*]]
; CHECK: [[SCALAR_PH]]:
;
entry:
@@ -190,8 +190,7 @@ define void @ptr_doesnt_depend_on_poison_or_ub(ptr noalias %dst, i16 noundef %of
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <2 x i16> [[VEC_IND]], splat (i16 10)
; CHECK-NEXT: [[TMP3:%.*]] = add i16 [[OFFSET_IDX]], [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr @src, i16 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
; CHECK-NEXT: br i1 [[TMP6]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; CHECK: [[PRED_STORE_IF]]:
@@ -214,7 +213,7 @@ define void @ptr_doesnt_depend_on_poison_or_ub(ptr noalias %dst, i16 noundef %of
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br [[EXIT:label %.*]]
; CHECK: [[SCALAR_PH]]:
;
entry:
@@ -290,7 +289,7 @@ define void @ptr_depends_on_possibly_poison_value_from_load(ptr noalias %dst) {
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br [[EXIT:label %.*]]
; CHECK: [[SCALAR_PH]]:
;
entry:
@@ -340,8 +339,7 @@ define void @ptr_depends_on_noundef_load(ptr noalias %dst) {
; CHECK-NEXT: [[TMP3:%.*]] = add i16 [[TMP2]], [[TMP0]]
; CHECK-NEXT: [[TMP4:%.*]] = add i16 [[OFFSET_IDX]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr @src, i16 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP5]], align 1
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; CHECK: [[PRED_STORE_IF]]:
@@ -364,7 +362,7 @@ define void @ptr_depends_on_noundef_load(ptr noalias %dst) {
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br [[EXIT:label %.*]]
; CHECK: [[SCALAR_PH]]:
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/load-of-struct-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/load-of-struct-deref-pred.ll
index b418fa7..f99e883 100644
--- a/llvm/test/Transforms/LoopVectorize/load-of-struct-deref-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/load-of-struct-deref-pred.ll
@@ -17,24 +17,21 @@ define void @accesses_to_struct_dereferenceable(ptr noalias %dst) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr @foo, i64 0, i32 1, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [[STRUCT_FOO]], ptr @foo, i64 0, i32 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[WIDE_LOAD2]], <4 x i32> [[WIDE_LOAD1]]
-; CHECK-NEXT: store <4 x i32> [[PREDPHI]], ptr [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr @foo, i64 0, i32 1, i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [[STRUCT_FOO]], ptr @foo, i64 0, i32 0, i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[WIDE_LOAD2]], <4 x i32> [[WIDE_LOAD1]]
+; CHECK-NEXT: store <4 x i32> [[PREDPHI]], ptr [[TMP0]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32000
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32000
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
; CHECK: loop.header:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
@@ -100,61 +97,58 @@ define void @accesses_to_struct_may_not_be_dereferenceable_due_to_loop_bound(ptr
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i1> [[TMP2]], splat (i1 true)
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
-; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], splat (i1 true)
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
+; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr @foo, i64 0, i32 1, i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr @foo, i64 0, i32 1, i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0
; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]]
; CHECK: pred.load.continue:
-; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1
-; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
+; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
+; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
; CHECK: pred.load.if1:
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr @foo, i64 0, i32 1, i64 [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP13]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr @foo, i64 0, i32 1, i64 [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP12]], i32 1
; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]]
; CHECK: pred.load.continue2:
-; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ]
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2
-; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
+; CHECK-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF1]] ]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
+; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
; CHECK: pred.load.if3:
-; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr @foo, i64 0, i32 1, i64 [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4
-; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP19]], i32 2
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr @foo, i64 0, i32 1, i64 [[TMP16]]
+; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP18]], i32 2
; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]]
; CHECK: pred.load.continue4:
-; CHECK-NEXT: [[TMP21:%.*]] = phi <4 x i32> [ [[TMP15]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP20]], [[PRED_LOAD_IF3]] ]
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
-; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]]
+; CHECK-NEXT: [[TMP20:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], [[PRED_LOAD_IF3]] ]
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
+; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]]
; CHECK: pred.load.if5:
-; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr @foo, i64 0, i32 1, i64 [[TMP23]]
-; CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4
-; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP21]], i32 [[TMP25]], i32 3
+; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr @foo, i64 0, i32 1, i64 [[TMP22]]
+; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4
+; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP24]], i32 3
; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]]
; CHECK: pred.load.continue6:
-; CHECK-NEXT: [[TMP27:%.*]] = phi <4 x i32> [ [[TMP21]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP26]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr [[STRUCT_FOO]], ptr @foo, i64 0, i32 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[TMP28]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP29]], align 4
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[WIDE_LOAD7]], <4 x i32> [[TMP27]]
-; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
-; CHECK-NEXT: store <4 x i32> [[PREDPHI]], ptr [[TMP30]], align 4
+; CHECK-NEXT: [[TMP26:%.*]] = phi <4 x i32> [ [[TMP20]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP25]], [[PRED_LOAD_IF5]] ]
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr [[STRUCT_FOO]], ptr @foo, i64 0, i32 0, i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP27]], align 4
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[WIDE_LOAD7]], <4 x i32> [[TMP26]]
+; CHECK-NEXT: store <4 x i32> [[PREDPHI]], ptr [[TMP0]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32000
-; CHECK-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32000
+; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
@@ -177,7 +171,7 @@ define void @accesses_to_struct_may_not_be_dereferenceable_due_to_loop_bound(ptr
; CHECK-NEXT: store i32 [[TMP_0]], ptr [[GEP_DST]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 32001
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -222,64 +216,61 @@ define void @accesses_to_struct_may_not_be_dereferenceable_access_size(ptr noali
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i1> [[TMP2]], splat (i1 true)
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
-; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], splat (i1 true)
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
+; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr @foo, i64 0, i32 1, i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> poison, i64 [[TMP7]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr @foo, i64 0, i32 1, i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP6]], i32 0
; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]]
; CHECK: pred.load.continue:
-; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i64> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1
-; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
+; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i64> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
+; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
; CHECK: pred.load.if1:
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr @foo, i64 0, i32 1, i64 [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> [[TMP9]], i64 [[TMP13]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr @foo, i64 0, i32 1, i64 [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP11]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i64> [[TMP8]], i64 [[TMP12]], i32 1
; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]]
; CHECK: pred.load.continue2:
-; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i64> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ]
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2
-; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
+; CHECK-NEXT: [[TMP14:%.*]] = phi <4 x i64> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF1]] ]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
+; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
; CHECK: pred.load.if3:
-; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr @foo, i64 0, i32 1, i64 [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP18]], align 4
-; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP19]], i32 2
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr @foo, i64 0, i32 1, i64 [[TMP16]]
+; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP17]], align 4
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP18]], i32 2
; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]]
; CHECK: pred.load.continue4:
-; CHECK-NEXT: [[TMP21:%.*]] = phi <4 x i64> [ [[TMP15]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP20]], [[PRED_LOAD_IF3]] ]
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
-; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]]
+; CHECK-NEXT: [[TMP20:%.*]] = phi <4 x i64> [ [[TMP14]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], [[PRED_LOAD_IF3]] ]
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
+; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]]
; CHECK: pred.load.if5:
-; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr @foo, i64 0, i32 1, i64 [[TMP23]]
-; CHECK-NEXT: [[TMP25:%.*]] = load i64, ptr [[TMP24]], align 4
-; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP25]], i32 3
+; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr @foo, i64 0, i32 1, i64 [[TMP22]]
+; CHECK-NEXT: [[TMP24:%.*]] = load i64, ptr [[TMP23]], align 4
+; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP24]], i32 3
; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]]
; CHECK: pred.load.continue6:
-; CHECK-NEXT: [[TMP27:%.*]] = phi <4 x i64> [ [[TMP21]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP26]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT: [[TMP28:%.*]] = trunc <4 x i64> [[TMP27]] to <4 x i32>
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr [[STRUCT_FOO]], ptr @foo, i64 0, i32 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i32, ptr [[TMP29]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP30]], align 4
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[WIDE_LOAD7]], <4 x i32> [[TMP28]]
-; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
-; CHECK-NEXT: store <4 x i32> [[PREDPHI]], ptr [[TMP31]], align 4
+; CHECK-NEXT: [[TMP26:%.*]] = phi <4 x i64> [ [[TMP20]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP25]], [[PRED_LOAD_IF5]] ]
+; CHECK-NEXT: [[TMP27:%.*]] = trunc <4 x i64> [[TMP26]] to <4 x i32>
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr [[STRUCT_FOO]], ptr @foo, i64 0, i32 0, i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[WIDE_LOAD7]], <4 x i32> [[TMP27]]
+; CHECK-NEXT: store <4 x i32> [[PREDPHI]], ptr [[TMP0]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32000
-; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32000
+; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
; CHECK: loop.header:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/loop-form.ll b/llvm/test/Transforms/LoopVectorize/loop-form.ll
index 64b9b47..10b2e70 100644
--- a/llvm/test/Transforms/LoopVectorize/loop-form.ll
+++ b/llvm/test/Transforms/LoopVectorize/loop-form.ll
@@ -19,8 +19,7 @@ define void @bottom_tested(ptr %p, i32 %n) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[TMP3]], i32 0
-; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP4]], align 4
+; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP3]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -128,8 +127,7 @@ define void @early_exit(ptr %p, i32 %n) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP6]], align 4
+; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -195,8 +193,7 @@ define i32 @early_exit_with_live_out(ptr %ptr) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <2 x i32> splat (i32 10), ptr [[TMP2]], align 4
+; CHECK-NEXT: store <2 x i32> splat (i32 10), ptr [[TMP1]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -330,8 +327,7 @@ define void @multiple_unique_exit(ptr %p, i32 %n) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP6]], align 4
+; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -410,8 +406,7 @@ define i32 @multiple_unique_exit2(ptr %p, i32 %n) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP6]], align 4
+; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -492,8 +487,7 @@ define i32 @multiple_unique_exit3(ptr %p, i32 %n) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP6]], align 4
+; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -575,8 +569,7 @@ define i32 @multiple_exit_blocks(ptr %p, i32 %n) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP6]], align 4
+; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
@@ -662,8 +655,7 @@ define i32 @multiple_exit_blocks2(ptr %p, i32 %n) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP6]], align 4
+; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
@@ -753,8 +745,7 @@ define i32 @multiple_exit_blocks3(ptr %p, i32 %n) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP6]], align 4
+; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
@@ -1077,8 +1068,7 @@ define void @scalar_predication(ptr %addr) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, ptr [[ADDR:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = fcmp une <2 x float> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
@@ -1179,8 +1169,7 @@ define i32 @me_reduction(ptr %addr) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[ADDR:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3]] = add <2 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
diff --git a/llvm/test/Transforms/LoopVectorize/make-followup-loop-id.ll b/llvm/test/Transforms/LoopVectorize/make-followup-loop-id.ll
index 1d633dd..14a091f 100644
--- a/llvm/test/Transforms/LoopVectorize/make-followup-loop-id.ll
+++ b/llvm/test/Transforms/LoopVectorize/make-followup-loop-id.ll
@@ -66,9 +66,9 @@ define void @f(ptr noundef captures(none) %a, float noundef %x) {
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
@@ -112,8 +112,7 @@ define void @f(ptr noundef captures(none) %a, float noundef %x) {
; CHECK-NEXT: [[MUL_7:%.*]] = fmul float [[X]], [[LOAD_7]]
; CHECK-NEXT: store float [[MUL_7]], ptr [[ARRAYIDX_7]], align 4
; CHECK-NEXT: [[IV_NEXT_7]] = add nuw nsw i64 [[IV]], 8
-; CHECK-NEXT: [[COMP_7:%.*]] = icmp eq i64 [[IV_NEXT_7]], 1024
-; CHECK-NEXT: br i1 [[COMP_7]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 true, label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT_LOOPEXIT]]:
; CHECK-NEXT: br label %[[EXIT]]
; CHECK: [[EXIT]]:
diff --git a/llvm/test/Transforms/LoopVectorize/metadata.ll b/llvm/test/Transforms/LoopVectorize/metadata.ll
index 287a15e..ce9c624 100644
--- a/llvm/test/Transforms/LoopVectorize/metadata.ll
+++ b/llvm/test/Transforms/LoopVectorize/metadata.ll
@@ -18,14 +18,12 @@ define void @fp_math(ptr nocapture %a, ptr noalias %b, i64 %size) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 4, !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 4, !tbaa [[TBAA0:![0-9]+]]
; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[WIDE_LOAD]], splat (double 9.900000e+01), !fpmath [[META3:![0-9]+]]
; CHECK-NEXT: [[TMP4:%.*]] = fcmp oge <2 x double> [[TMP3]], splat (double 1.000000e+01)
-; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP4]], <2 x double> [[WIDE_LOAD]], <2 x double> zeroinitializer, !fpmath [[META3]]
-; CHECK-NEXT: [[TMP6:%.*]] = fptrunc <2 x double> [[TMP5]] to <2 x float>, !fpmath [[META3]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <2 x float> [[TMP6]], ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP4]], <2 x double> [[WIDE_LOAD]], <2 x double> zeroinitializer, !fpmath [[META3]]
+; CHECK-NEXT: [[TMP5:%.*]] = fptrunc <2 x double> [[TMP6]] to <2 x float>, !fpmath [[META3]]
+; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[TMP1]], align 4, !tbaa [[TBAA0]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -64,22 +62,20 @@ define void @fp_math(ptr nocapture %a, ptr noalias %b, i64 %size) {
; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]]
; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]]
-; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 0
; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 2
-; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 4, !tbaa [[TBAA0:![0-9]+]]
+; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 4, !tbaa [[TBAA0:![0-9]+]]
; INTERLEAVE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
; INTERLEAVE-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[WIDE_LOAD]], splat (double 9.900000e+01), !fpmath [[META3:![0-9]+]]
; INTERLEAVE-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[WIDE_LOAD1]], splat (double 9.900000e+01), !fpmath [[META3]]
; INTERLEAVE-NEXT: [[TMP6:%.*]] = fcmp oge <2 x double> [[TMP4]], splat (double 1.000000e+01)
; INTERLEAVE-NEXT: [[TMP7:%.*]] = fcmp oge <2 x double> [[TMP5]], splat (double 1.000000e+01)
-; INTERLEAVE-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP6]], <2 x double> [[WIDE_LOAD]], <2 x double> zeroinitializer, !fpmath [[META3]]
-; INTERLEAVE-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP7]], <2 x double> [[WIDE_LOAD1]], <2 x double> zeroinitializer, !fpmath [[META3]]
+; INTERLEAVE-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP6]], <2 x double> [[WIDE_LOAD]], <2 x double> zeroinitializer, !fpmath [[META3]]
+; INTERLEAVE-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x double> [[WIDE_LOAD1]], <2 x double> zeroinitializer, !fpmath [[META3]]
+; INTERLEAVE-NEXT: [[TMP9:%.*]] = fptrunc <2 x double> [[TMP11]] to <2 x float>, !fpmath [[META3]]
; INTERLEAVE-NEXT: [[TMP10:%.*]] = fptrunc <2 x double> [[TMP8]] to <2 x float>, !fpmath [[META3]]
-; INTERLEAVE-NEXT: [[TMP11:%.*]] = fptrunc <2 x double> [[TMP9]] to <2 x float>, !fpmath [[META3]]
-; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
; INTERLEAVE-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 2
-; INTERLEAVE-NEXT: store <2 x float> [[TMP10]], ptr [[TMP12]], align 4, !tbaa [[TBAA0]]
-; INTERLEAVE-NEXT: store <2 x float> [[TMP11]], ptr [[TMP13]], align 4, !tbaa [[TBAA0]]
+; INTERLEAVE-NEXT: store <2 x float> [[TMP9]], ptr [[TMP1]], align 4, !tbaa [[TBAA0]]
+; INTERLEAVE-NEXT: store <2 x float> [[TMP10]], ptr [[TMP13]], align 4, !tbaa [[TBAA0]]
; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; INTERLEAVE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; INTERLEAVE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -137,19 +133,17 @@ define void @widen_call_range(ptr noalias %a, ptr readonly %b) {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP1]], align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @foo_vector_fixed2_nomask(<2 x i64> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; CHECK-NEXT: store <2 x i64> [[TMP2]], ptr [[TMP4]], align 4
+; CHECK-NEXT: store <2 x i64> [[TMP2]], ptr [[TMP3]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -173,24 +167,22 @@ define void @widen_call_range(ptr noalias %a, ptr readonly %b) {
; INTERLEAVE: [[VECTOR_BODY]]:
; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDEX]]
-; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[TMP0]], i32 0
; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP0]], i32 2
-; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP1]], align 4, !tbaa [[TBAA0]]
+; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
; INTERLEAVE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP2]], align 4, !tbaa [[TBAA0]]
; INTERLEAVE-NEXT: [[TMP3:%.*]] = call <2 x i64> @foo_vector_fixed2_nomask(<2 x i64> [[WIDE_LOAD]])
; INTERLEAVE-NEXT: [[TMP4:%.*]] = call <2 x i64> @foo_vector_fixed2_nomask(<2 x i64> [[WIDE_LOAD1]])
; INTERLEAVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 2
-; INTERLEAVE-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP6]], align 4
+; INTERLEAVE-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP5]], align 4
; INTERLEAVE-NEXT: store <2 x i64> [[TMP4]], ptr [[TMP7]], align 4
; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; INTERLEAVE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; INTERLEAVE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; INTERLEAVE: [[MIDDLE_BLOCK]]:
-; INTERLEAVE-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE-NEXT: br label %[[EXIT:.*]]
; INTERLEAVE: [[SCALAR_PH]]:
-; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; INTERLEAVE-NEXT: br label %[[LOOP:.*]]
; INTERLEAVE: [[LOOP]]:
; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -233,19 +225,17 @@ define void @widen_call_fpmath(ptr noalias %a, ptr readonly %b) {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP1]], align 8, !tbaa [[TBAA0]]
-; CHECK-NEXT: [[TMP2:%.*]] = call <2 x double> @bar_vector_fixed2_nomask(<2 x double> [[WIDE_LOAD]]), !fpmath [[META3]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8, !tbaa [[TBAA0]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @bar_vector_fixed2_nomask(<2 x double> [[WIDE_LOAD]]), !fpmath [[META3]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 0
-; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[TMP4]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[TMP3]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -269,24 +259,22 @@ define void @widen_call_fpmath(ptr noalias %a, ptr readonly %b) {
; INTERLEAVE: [[VECTOR_BODY]]:
; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]]
-; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[TMP0]], i32 0
; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr double, ptr [[TMP0]], i32 2
-; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP1]], align 8, !tbaa [[TBAA0]]
+; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8, !tbaa [[TBAA0]]
; INTERLEAVE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP2]], align 8, !tbaa [[TBAA0]]
; INTERLEAVE-NEXT: [[TMP3:%.*]] = call <2 x double> @bar_vector_fixed2_nomask(<2 x double> [[WIDE_LOAD]]), !fpmath [[META3]]
; INTERLEAVE-NEXT: [[TMP4:%.*]] = call <2 x double> @bar_vector_fixed2_nomask(<2 x double> [[WIDE_LOAD1]]), !fpmath [[META3]]
; INTERLEAVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]]
-; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i32 0
; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i32 2
-; INTERLEAVE-NEXT: store <2 x double> [[TMP3]], ptr [[TMP6]], align 8
+; INTERLEAVE-NEXT: store <2 x double> [[TMP3]], ptr [[TMP5]], align 8
; INTERLEAVE-NEXT: store <2 x double> [[TMP4]], ptr [[TMP7]], align 8
; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; INTERLEAVE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; INTERLEAVE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; INTERLEAVE: [[MIDDLE_BLOCK]]:
-; INTERLEAVE-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE-NEXT: br label %[[EXIT:.*]]
; INTERLEAVE: [[SCALAR_PH]]:
-; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; INTERLEAVE-NEXT: br label %[[LOOP:.*]]
; INTERLEAVE: [[LOOP]]:
; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -329,19 +317,17 @@ define void @widen_intrinsic(ptr noalias %a, ptr readonly %b) {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[WIDE_LOAD]], i1 true)
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; CHECK-NEXT: store <2 x i64> [[TMP2]], ptr [[TMP4]], align 4
+; CHECK-NEXT: store <2 x i64> [[TMP2]], ptr [[TMP3]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -365,24 +351,22 @@ define void @widen_intrinsic(ptr noalias %a, ptr readonly %b) {
; INTERLEAVE: [[VECTOR_BODY]]:
; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDEX]]
-; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[TMP0]], i32 0
; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP0]], i32 2
-; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP1]], align 4
+; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 4
; INTERLEAVE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP2]], align 4
; INTERLEAVE-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[WIDE_LOAD]], i1 true)
; INTERLEAVE-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[WIDE_LOAD1]], i1 true)
; INTERLEAVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]]
-; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 2
-; INTERLEAVE-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP6]], align 4
+; INTERLEAVE-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP5]], align 4
; INTERLEAVE-NEXT: store <2 x i64> [[TMP4]], ptr [[TMP7]], align 4
; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; INTERLEAVE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; INTERLEAVE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; INTERLEAVE: [[MIDDLE_BLOCK]]:
-; INTERLEAVE-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE-NEXT: br label %[[EXIT:.*]]
; INTERLEAVE: [[SCALAR_PH]]:
-; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; INTERLEAVE-NEXT: br label %[[LOOP:.*]]
; INTERLEAVE: [[LOOP]]:
; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -425,19 +409,17 @@ define void @widen_intrinsic_fpmath(ptr noalias %a, ptr readonly %b) {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP1]], align 8, !tbaa [[TBAA0]]
-; CHECK-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.sin.v2f64(<2 x double> [[WIDE_LOAD]]), !fpmath [[META3]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8, !tbaa [[TBAA0]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <2 x double> @llvm.sin.v2f64(<2 x double> [[WIDE_LOAD]]), !fpmath [[META3]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i32 0
-; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[TMP4]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[TMP3]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -461,24 +443,22 @@ define void @widen_intrinsic_fpmath(ptr noalias %a, ptr readonly %b) {
; INTERLEAVE: [[VECTOR_BODY]]:
; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]]
-; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[TMP0]], i32 0
; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr double, ptr [[TMP0]], i32 2
-; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP1]], align 8, !tbaa [[TBAA0]]
+; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8, !tbaa [[TBAA0]]
; INTERLEAVE-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP2]], align 8, !tbaa [[TBAA0]]
; INTERLEAVE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.sin.v2f64(<2 x double> [[WIDE_LOAD]]), !fpmath [[META3]]
; INTERLEAVE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.sin.v2f64(<2 x double> [[WIDE_LOAD1]]), !fpmath [[META3]]
; INTERLEAVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]]
-; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i32 0
; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i32 2
-; INTERLEAVE-NEXT: store <2 x double> [[TMP3]], ptr [[TMP6]], align 8
+; INTERLEAVE-NEXT: store <2 x double> [[TMP3]], ptr [[TMP5]], align 8
; INTERLEAVE-NEXT: store <2 x double> [[TMP4]], ptr [[TMP7]], align 8
; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; INTERLEAVE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; INTERLEAVE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; INTERLEAVE: [[MIDDLE_BLOCK]]:
-; INTERLEAVE-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; INTERLEAVE-NEXT: br label %[[EXIT:.*]]
; INTERLEAVE: [[SCALAR_PH]]:
-; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; INTERLEAVE-NEXT: br label %[[LOOP:.*]]
; INTERLEAVE: [[LOOP]]:
; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -525,12 +505,10 @@ define void @unknown_metadata(ptr nocapture %a, ptr noalias %b, i64 %size) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[INDEX]], !custom_md [[META2:![0-9]+]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[INDEX]], !custom_md [[META2:![0-9]+]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B]], <2 x i64> [[VEC_IND]]
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x ptr> [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[TMP4]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i32 0
+; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[TMP1]], align 4
; CHECK-NEXT: store <2 x ptr> [[TMP2]], ptr [[TMP6]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
@@ -575,13 +553,11 @@ define void @unknown_metadata(ptr nocapture %a, ptr noalias %b, i64 %size) {
; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], <2 x i64> [[VEC_IND]]
; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B]], <2 x i64> [[STEP_ADD]]
; INTERLEAVE-NEXT: [[TMP3:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
-; INTERLEAVE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
; INTERLEAVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 2
-; INTERLEAVE-NEXT: store <2 x i32> [[VEC_IND1]], ptr [[TMP4]], align 4
+; INTERLEAVE-NEXT: store <2 x i32> [[VEC_IND1]], ptr [[TMP3]], align 4
; INTERLEAVE-NEXT: store <2 x i32> [[STEP_ADD3]], ptr [[TMP5]], align 4
-; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i32 0
; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i32 2
-; INTERLEAVE-NEXT: store <2 x ptr> [[TMP1]], ptr [[TMP6]], align 8
+; INTERLEAVE-NEXT: store <2 x ptr> [[TMP1]], ptr [[TMP0]], align 8
; INTERLEAVE-NEXT: store <2 x ptr> [[TMP2]], ptr [[TMP7]], align 8
; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2)
diff --git a/llvm/test/Transforms/LoopVectorize/min-trip-count-known-via-scev.ll b/llvm/test/Transforms/LoopVectorize/min-trip-count-known-via-scev.ll
index 6ced1f1..445ef03 100644
--- a/llvm/test/Transforms/LoopVectorize/min-trip-count-known-via-scev.ll
+++ b/llvm/test/Transforms/LoopVectorize/min-trip-count-known-via-scev.ll
@@ -20,8 +20,7 @@ define i32 @loop_with_at_least_2_iterations_via_guards_order_1(ptr %dst, i32 %n)
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <2 x i32> splat (i32 1), ptr [[TMP2]], align 4
+; CHECK-NEXT: store <2 x i32> splat (i32 1), ptr [[TMP1]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -88,8 +87,7 @@ define i32 @loop_with_at_least_2_iterations_via_guards_order_2(ptr %dst, i32 %n)
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <2 x i32> splat (i32 1), ptr [[TMP2]], align 4
+; CHECK-NEXT: store <2 x i32> splat (i32 1), ptr [[TMP1]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -162,8 +160,7 @@ define void @loop_never_executes_precondition_order_1(i64 %start, ptr %dst) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[START]], [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP3]], align 4
+; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP2]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -236,8 +233,7 @@ define void @loop_never_executes_precondition_order_1_predicates_flipped(i64 %st
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[START]], [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP3]], align 4
+; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP2]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -307,8 +303,7 @@ define void @loop_never_executes_precondition_order_2_predicates_flipped(i64 %st
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[START]], [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP3]], align 4
+; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP2]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/minimumnum-maximumnum-reductions.ll b/llvm/test/Transforms/LoopVectorize/minimumnum-maximumnum-reductions.ll
index 6dde2b9..e26fef4 100644
--- a/llvm/test/Transforms/LoopVectorize/minimumnum-maximumnum-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/minimumnum-maximumnum-reductions.ll
@@ -14,9 +14,8 @@ define float @maximumnum_intrinsic(ptr readonly %x) {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[GEP]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[GEP]], i32 2
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[GEP]], align 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x float>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP3]] = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> [[VEC_PHI]], <2 x float> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP4]] = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> [[VEC_PHI1]], <2 x float> [[WIDE_LOAD2]])
@@ -26,10 +25,10 @@ define float @maximumnum_intrinsic(ptr readonly %x) {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]])
; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> [[RDX_MINMAX]])
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[LOOP]] ]
@@ -73,9 +72,8 @@ define float @maximumnum_intrinsic_fast(ptr readonly %x) {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[GEP]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[GEP]], i32 2
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[GEP]], align 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x float>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP3]] = call fast <2 x float> @llvm.maximumnum.v2f32(<2 x float> [[VEC_PHI]], <2 x float> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP4]] = call fast <2 x float> @llvm.maximumnum.v2f32(<2 x float> [[VEC_PHI1]], <2 x float> [[WIDE_LOAD2]])
@@ -85,10 +83,10 @@ define float @maximumnum_intrinsic_fast(ptr readonly %x) {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call fast <2 x float> @llvm.maximumnum.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]])
; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> [[RDX_MINMAX]])
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[LOOP]] ]
@@ -132,9 +130,8 @@ define float @minimumnum_intrinsic(ptr readonly %x) {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[GEP]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[GEP]], i32 2
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[GEP]], align 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x float>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP3]] = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> [[VEC_PHI]], <2 x float> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP4]] = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> [[VEC_PHI1]], <2 x float> [[WIDE_LOAD2]])
@@ -144,10 +141,10 @@ define float @minimumnum_intrinsic(ptr readonly %x) {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]])
; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> [[RDX_MINMAX]])
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[LOOP]] ]
@@ -191,9 +188,8 @@ define float @minimumnum_intrinsic_fast(ptr readonly %x) {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[GEP]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[GEP]], i32 2
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[GEP]], align 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x float>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP3]] = call fast <2 x float> @llvm.minimumnum.v2f32(<2 x float> [[VEC_PHI]], <2 x float> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP4]] = call fast <2 x float> @llvm.minimumnum.v2f32(<2 x float> [[VEC_PHI1]], <2 x float> [[WIDE_LOAD2]])
@@ -203,10 +199,10 @@ define float @minimumnum_intrinsic_fast(ptr readonly %x) {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call fast <2 x float> @llvm.minimumnum.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]])
; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> [[RDX_MINMAX]])
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll b/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll
index ca0edb3..cc246d55 100644
--- a/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll
+++ b/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll
@@ -33,7 +33,7 @@ define i32 @main() #0 {
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 40000
; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
diff --git a/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll b/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll
index 3af6ecd..d21621e 100644
--- a/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll
@@ -65,17 +65,15 @@ define void @Test(ptr nocapture %obj, i64 %z) #0 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[OBJ]], i64 0, i32 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4, !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4, !alias.scope [[META0:![0-9]+]]
; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP9]], align 4, !alias.scope [[META3:![0-9]+]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP13]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
; CHECK-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[TMP14]], [[WIDE_LOAD8]]
-; CHECK-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP16]], align 4, !alias.scope [[META5]], !noalias [[META7]]
+; CHECK-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP15]], align 4, !alias.scope [[META5]], !noalias [[META7]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -131,17 +129,15 @@ define void @Test(ptr nocapture %obj, i64 %z) #0 {
; CHECK-HOIST: vector.body:
; CHECK-HOIST-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-HOIST-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[OBJ]], i64 0, i32 0, i64 [[INDEX]]
-; CHECK-HOIST-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-HOIST-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META0:![0-9]+]]
+; CHECK-HOIST-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META0:![0-9]+]]
; CHECK-HOIST-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP3]], align 4, !alias.scope [[META3:![0-9]+]]
; CHECK-HOIST-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0
; CHECK-HOIST-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-HOIST-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
; CHECK-HOIST-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[INDEX]]
-; CHECK-HOIST-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; CHECK-HOIST-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
+; CHECK-HOIST-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
; CHECK-HOIST-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[TMP8]], [[WIDE_LOAD5]]
-; CHECK-HOIST-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP10]], align 4, !alias.scope [[META5]], !noalias [[META7]]
+; CHECK-HOIST-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP9]], align 4, !alias.scope [[META5]], !noalias [[META7]]
; CHECK-HOIST-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-HOIST-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-HOIST-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll b/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
index 6a17b5f..11c8991 100644
--- a/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
+++ b/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
@@ -25,7 +25,7 @@ define void @narrow_select_to_single_scalar(i1 %invar.cond, ptr noalias %A, ptr
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
@@ -39,7 +39,7 @@ define void @narrow_select_to_single_scalar(i1 %invar.cond, ptr noalias %A, ptr
; CHECK-NEXT: store i16 0, ptr [[GEP_C]], align 1
; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp ne i16 [[IV]], 1024
-; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[EXIT:.*]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll b/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll
index 7d33f62..404ef09 100644
--- a/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll
+++ b/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll
@@ -37,8 +37,7 @@ define i32 @test(ptr %arr, i64 %n) {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
; CHECK-NEXT: [[TMP17:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
-; CHECK-NEXT: store <4 x i32> splat (i32 65), ptr [[TMP19]], align 4
+; CHECK-NEXT: store <4 x i32> splat (i32 65), ptr [[TMP18]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/no_outside_user.ll b/llvm/test/Transforms/LoopVectorize/no_outside_user.ll
index ba85bb4..02f3242 100644
--- a/llvm/test/Transforms/LoopVectorize/no_outside_user.ll
+++ b/llvm/test/Transforms/LoopVectorize/no_outside_user.ll
@@ -28,9 +28,9 @@ define i32 @test1() {
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 2
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[B_PROMOTED]], [[N_VEC]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[B_PROMOTED]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[B_PROMOTED]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1>
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -263,8 +263,8 @@ define i32 @test4(i32 %N) {
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 2
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[B_PROMOTED]], [[N_VEC]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[B_PROMOTED]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[B_PROMOTED]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
@@ -669,15 +669,12 @@ define i32 @sum_arrays_outside_use(ptr %B, ptr %A, ptr %C, i32 %N) {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[B_PROMOTED]], [[INDEX]]
; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[OFFSET_IDX]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP7]], align 4
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x i32>, ptr [[TMP10]], align 4
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x i32>, ptr [[TMP9]], align 4
; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD5]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; CHECK-NEXT: store <2 x i32> [[TMP11]], ptr [[TMP13]], align 4
+; CHECK-NEXT: store <2 x i32> [[TMP11]], ptr [[TMP12]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -742,17 +739,15 @@ define i32 @non_uniform_live_out() {
; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 7)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP2]], align 1
; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i8> [[WIDE_LOAD]], splat (i8 1)
-; CHECK-NEXT: store <2 x i8> [[TMP4]], ptr [[TMP3]], align 1
+; CHECK-NEXT: store <2 x i8> [[TMP4]], ptr [[TMP2]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20000
; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
-; CHECK-NEXT: br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 20000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
@@ -765,9 +760,9 @@ define i32 @non_uniform_live_out() {
; CHECK-NEXT: store i8 [[BUMP]], ptr [[ARRAYIDX]], align 1
; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 20000
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[LCSSA:%.*]] = phi i32 [ [[I_09]], %[[FOR_BODY]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[LCSSA:%.*]] = phi i32 [ [[I_09]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX_OUT:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[LCSSA]]
; CHECK-NEXT: store i8 42, ptr [[ARRAYIDX_OUT]], align 1
; CHECK-NEXT: ret i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/opaque-ptr.ll b/llvm/test/Transforms/LoopVectorize/opaque-ptr.ll
index 6f3736c..496285a 100644
--- a/llvm/test/Transforms/LoopVectorize/opaque-ptr.ll
+++ b/llvm/test/Transforms/LoopVectorize/opaque-ptr.ll
@@ -49,12 +49,10 @@ define void @test_ptr_iv_no_inbounds(ptr %p1.start, ptr %p2.start, ptr %p1.end)
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[P1_START]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[OFFSET_IDX10:%.*]] = mul i64 [[INDEX]], 4
; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i8, ptr [[P2_START]], i64 [[OFFSET_IDX10]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr float, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP17]], align 4, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr float, ptr [[NEXT_GEP11]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <2 x float>, ptr [[TMP18]], align 4, !alias.scope [[META3]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[NEXT_GEP]], align 4, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <2 x float>, ptr [[NEXT_GEP11]], align 4, !alias.scope [[META3]]
; CHECK-NEXT: [[TMP19:%.*]] = fadd <2 x float> [[WIDE_LOAD]], [[WIDE_LOAD12]]
-; CHECK-NEXT: store <2 x float> [[TMP19]], ptr [[TMP17]], align 4, !alias.scope [[META0]], !noalias [[META3]]
+; CHECK-NEXT: store <2 x float> [[TMP19]], ptr [[NEXT_GEP]], align 4, !alias.scope [[META0]], !noalias [[META3]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -137,12 +135,10 @@ define void @test_ptr_iv_with_inbounds(ptr %p1.start, ptr %p2.start, ptr %p1.end
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[P1_START]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[OFFSET_IDX8:%.*]] = mul i64 [[INDEX]], 4
; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[P2_START]], i64 [[OFFSET_IDX8]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr float, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP13]], align 4, !alias.scope [[META9:![0-9]+]], !noalias [[META12:![0-9]+]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[NEXT_GEP9]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x float>, ptr [[TMP14]], align 4, !alias.scope [[META12]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[NEXT_GEP]], align 4, !alias.scope [[META9:![0-9]+]], !noalias [[META12:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x float>, ptr [[NEXT_GEP9]], align 4, !alias.scope [[META12]]
; CHECK-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_LOAD]], [[WIDE_LOAD10]]
-; CHECK-NEXT: store <2 x float> [[TMP15]], ptr [[TMP13]], align 4, !alias.scope [[META9]], !noalias [[META12]]
+; CHECK-NEXT: store <2 x float> [[TMP15]], ptr [[NEXT_GEP]], align 4, !alias.scope [[META9]], !noalias [[META12]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
@@ -208,8 +204,7 @@ define void @store_pointer_induction(ptr %start, ptr %end) {
; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <2 x i64> <i64 0, i64 8>
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr ptr, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <2 x ptr> [[TMP5]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store <2 x ptr> [[TMP5]], ptr [[TMP6]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 16
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll
index 6695450..ee74f22 100644
--- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll
@@ -29,11 +29,9 @@ define signext i32 @f1(ptr noalias %A, ptr noalias %B, i32 signext %n) {
; VF-TWO-CHECK: vector.body:
; VF-TWO-CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VF-TWO-CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; VF-TWO-CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; VF-TWO-CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
-; VF-TWO-CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
; VF-TWO-CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; VF-TWO-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; VF-TWO-CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -54,11 +52,9 @@ define signext i32 @f1(ptr noalias %A, ptr noalias %B, i32 signext %n) {
; VF-TWO-CHECK: vec.epilog.vector.body:
; VF-TWO-CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; VF-TWO-CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX6]]
-; VF-TWO-CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <2 x i32>, ptr [[TMP10]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <2 x i32>, ptr [[TMP9]], align 4
; VF-TWO-CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX6]]
-; VF-TWO-CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x i32>, ptr [[TMP12]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x i32>, ptr [[TMP11]], align 4
; VF-TWO-CHECK-NEXT: [[TMP13:%.*]] = add nsw <2 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD8]]
; VF-TWO-CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 2
; VF-TWO-CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
index 1480bc9..a79a8dd 100644
--- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
@@ -31,15 +31,12 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[BB:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[CC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[AA:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP6]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -58,15 +55,12 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[INDEX6]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP11]], align 4
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP10]], align 4
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[INDEX6]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP12]], align 4
; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD8]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[INDEX6]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 0
-; CHECK-NEXT: store <4 x float> [[TMP14]], ptr [[TMP16]], align 4
+; CHECK-NEXT: store <4 x float> [[TMP14]], ptr [[TMP15]], align 4
; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 4
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
; CHECK-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -187,8 +181,7 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n)
; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP16:%.*]] = fadd fast <4 x float> [[REVERSE]], splat (float 1.000000e+00)
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
-; CHECK-NEXT: store <4 x float> [[TMP16]], ptr [[TMP18]], align 4
+; CHECK-NEXT: store <4 x float> [[TMP16]], ptr [[TMP17]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -219,8 +212,7 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n)
; CHECK-NEXT: [[REVERSE10:%.*]] = shufflevector <4 x float> [[WIDE_LOAD9]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <4 x float> [[REVERSE10]], splat (float 1.000000e+00)
; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX7]]
-; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i32 0
-; CHECK-NEXT: store <4 x float> [[TMP28]], ptr [[TMP30]], align 4
+; CHECK-NEXT: store <4 x float> [[TMP28]], ptr [[TMP29]], align 4
; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX7]], 4
; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC3]]
; CHECK-NEXT: br i1 [[TMP31]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -326,8 +318,7 @@ define void @f3(ptr noalias %A, i64 %n) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <4 x i8> splat (i8 1), ptr [[TMP2]], align 1
+; CHECK-NEXT: store <4 x i8> splat (i8 1), ptr [[TMP1]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -346,8 +337,7 @@ define void @f3(ptr noalias %A, i64 %n) {
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX5]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <4 x i8> splat (i8 1), ptr [[TMP6]], align 1
+; CHECK-NEXT: store <4 x i8> splat (i8 1), ptr [[TMP5]], align 1
; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX5]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
@@ -383,8 +373,7 @@ define void @f3(ptr noalias %A, i64 %n) {
; CHECK-PROFITABLE-BY-DEFAULT: vector.body:
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <4 x i8> splat (i8 1), ptr [[TMP2]], align 1
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <4 x i8> splat (i8 1), ptr [[TMP1]], align 1
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -403,8 +392,7 @@ define void @f3(ptr noalias %A, i64 %n) {
; CHECK-PROFITABLE-BY-DEFAULT: vec.epilog.vector.body:
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX5]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <2 x i8> splat (i8 1), ptr [[TMP6]], align 1
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <2 x i8> splat (i8 1), ptr [[TMP5]], align 1
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX5]], 2
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -477,8 +465,7 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <4 x i8> [[VEC_IND]], ptr [[TMP6]], align 1
+; CHECK-NEXT: store <4 x i8> [[VEC_IND]], ptr [[TMP5]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], [[DOTSPLAT2]]
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 84
@@ -507,8 +494,7 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst
; CHECK-NEXT: [[VEC_IND15:%.*]] = phi <4 x i8> [ [[INDUCTION12]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX17:%.*]] = add i64 1, [[INDEX7]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[OFFSET_IDX17]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: store <4 x i8> [[VEC_IND15]], ptr [[TMP12]], align 1
+; CHECK-NEXT: store <4 x i8> [[VEC_IND15]], ptr [[TMP11]], align 1
; CHECK-NEXT: [[INDEX_NEXT18]] = add nuw i64 [[INDEX7]], 4
; CHECK-NEXT: [[VEC_IND_NEXT16]] = add <4 x i8> [[VEC_IND15]], [[DOTSPLAT14]]
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT18]], 84
@@ -565,8 +551,7 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[OFFSET_IDX]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <4 x i8> [[VEC_IND]], ptr [[TMP6]], align 1
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <4 x i8> [[VEC_IND]], ptr [[TMP5]], align 1
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], [[DOTSPLAT2]]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 84
@@ -595,8 +580,7 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND15:%.*]] = phi <2 x i8> [ [[INDUCTION12]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[OFFSET_IDX17:%.*]] = add i64 1, [[INDEX7]]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[OFFSET_IDX17]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <2 x i8> [[VEC_IND15]], ptr [[TMP12]], align 1
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <2 x i8> [[VEC_IND15]], ptr [[TMP11]], align 1
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX_NEXT18]] = add nuw i64 [[INDEX7]], 2
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND_NEXT16]] = add <2 x i8> [[VEC_IND15]], [[DOTSPLAT14]]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT18]], 84
@@ -667,8 +651,7 @@ define void @f4(ptr noalias %A, i32 signext %n) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i32> [[VEC_IND]] to <4 x i8>
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; CHECK-NEXT: store <4 x i8> [[TMP1]], ptr [[TMP3]], align 1
+; CHECK-NEXT: store <4 x i8> [[TMP1]], ptr [[TMP2]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -694,8 +677,7 @@ define void @f4(ptr noalias %A, i32 signext %n) {
; CHECK-NEXT: [[VEC_IND7:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = trunc <4 x i32> [[VEC_IND7]] to <4 x i8>
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX6]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <4 x i8> [[TMP7]], ptr [[TMP9]], align 1
+; CHECK-NEXT: store <4 x i8> [[TMP7]], ptr [[TMP8]], align 1
; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 4
; CHECK-NEXT: [[VEC_IND_NEXT8]] = add <4 x i32> [[VEC_IND7]], splat (i32 4)
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC3]]
@@ -735,8 +717,7 @@ define void @f4(ptr noalias %A, i32 signext %n) {
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP1:%.*]] = trunc <4 x i32> [[VEC_IND]] to <4 x i8>
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <4 x i8> [[TMP1]], ptr [[TMP3]], align 1
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <4 x i8> [[TMP1]], ptr [[TMP2]], align 1
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -762,8 +743,7 @@ define void @f4(ptr noalias %A, i32 signext %n) {
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND7:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP7:%.*]] = trunc <2 x i32> [[VEC_IND7]] to <2 x i8>
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX6]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <2 x i8> [[TMP7]], ptr [[TMP9]], align 1
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <2 x i8> [[TMP7]], ptr [[TMP8]], align 1
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 2
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND_NEXT8]] = add <2 x i32> [[VEC_IND7]], splat (i32 2)
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC3]]
diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll
index 38e436e..f0d026b 100644
--- a/llvm/test/Transforms/LoopVectorize/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/optsize.ll
@@ -200,16 +200,15 @@ define i32 @foo_pgso() !prof !14 {
; NPGSO: [[VECTOR_BODY]]:
; NPGSO-NEXT: [[TMP0:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; NPGSO-NEXT: [[TMP1:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[TMP0]]
-; NPGSO-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; NPGSO-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; NPGSO-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
; NPGSO-NEXT: [[TMP3:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
; NPGSO-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i8> splat (i8 2), <4 x i8> splat (i8 1)
-; NPGSO-NEXT: store <4 x i8> [[TMP4]], ptr [[TMP2]], align 1
+; NPGSO-NEXT: store <4 x i8> [[TMP4]], ptr [[TMP1]], align 1
; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[TMP0]], 4
; NPGSO-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 200
; NPGSO-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; NPGSO: [[MIDDLE_BLOCK]]:
-; NPGSO-NEXT: br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; NPGSO-NEXT: br label %[[SCALAR_PH]]
; NPGSO: [[SCALAR_PH]]:
; NPGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 200, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; NPGSO-NEXT: br label %[[FOR_BODY:.*]]
@@ -222,7 +221,7 @@ define i32 @foo_pgso() !prof !14 {
; NPGSO-NEXT: store i8 [[DOT]], ptr [[ARRAYIDX]], align 1
; NPGSO-NEXT: [[INC]] = add nsw i32 [[I_08]], 1
; NPGSO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
-; NPGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; NPGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; NPGSO: [[FOR_END]]:
; NPGSO-NEXT: ret i32 0
;
@@ -274,9 +273,9 @@ define void @pr43371() optsize {
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756
; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[FOR_COND_CLEANUP28:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP28:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 756, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[FOR_BODY29:.*]]
; CHECK: [[FOR_COND_CLEANUP28]]:
; CHECK-NEXT: unreachable
@@ -312,9 +311,9 @@ define void @pr43371() optsize {
; PGSO-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756
; PGSO-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; PGSO: [[MIDDLE_BLOCK]]:
-; PGSO-NEXT: br i1 true, label %[[FOR_COND_CLEANUP28:.*]], label %[[SCALAR_PH]]
+; PGSO-NEXT: br label %[[FOR_COND_CLEANUP28:.*]]
; PGSO: [[SCALAR_PH]]:
-; PGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 756, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; PGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, %[[ENTRY]] ]
; PGSO-NEXT: br label %[[FOR_BODY29:.*]]
; PGSO: [[FOR_COND_CLEANUP28]]:
; PGSO-NEXT: unreachable
@@ -350,9 +349,9 @@ define void @pr43371() optsize {
; NPGSO-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756
; NPGSO-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; NPGSO: [[MIDDLE_BLOCK]]:
-; NPGSO-NEXT: br i1 true, label %[[FOR_COND_CLEANUP28:.*]], label %[[SCALAR_PH]]
+; NPGSO-NEXT: br label %[[FOR_COND_CLEANUP28:.*]]
; NPGSO: [[SCALAR_PH]]:
-; NPGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 756, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; NPGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, %[[ENTRY]] ]
; NPGSO-NEXT: br label %[[FOR_BODY29:.*]]
; NPGSO: [[FOR_COND_CLEANUP28]]:
; NPGSO-NEXT: unreachable
@@ -411,9 +410,9 @@ define void @pr43371_pgso() !prof !14 {
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756
; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[FOR_COND_CLEANUP28:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP28:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 756, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[FOR_BODY29:.*]]
; CHECK: [[FOR_COND_CLEANUP28]]:
; CHECK-NEXT: unreachable
@@ -449,9 +448,9 @@ define void @pr43371_pgso() !prof !14 {
; PGSO-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756
; PGSO-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; PGSO: [[MIDDLE_BLOCK]]:
-; PGSO-NEXT: br i1 true, label %[[FOR_COND_CLEANUP28:.*]], label %[[SCALAR_PH]]
+; PGSO-NEXT: br label %[[FOR_COND_CLEANUP28:.*]]
; PGSO: [[SCALAR_PH]]:
-; PGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 756, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; PGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, %[[ENTRY]] ]
; PGSO-NEXT: br label %[[FOR_BODY29:.*]]
; PGSO: [[FOR_COND_CLEANUP28]]:
; PGSO-NEXT: unreachable
@@ -479,15 +478,14 @@ define void @pr43371_pgso() !prof !14 {
; NPGSO-NEXT: [[TMP1:%.*]] = add i16 undef, [[OFFSET_IDX]]
; NPGSO-NEXT: [[TMP2:%.*]] = zext i16 [[TMP1]] to i32
; NPGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]]
-; NPGSO-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[TMP3]], i32 0
-; NPGSO-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP4]], align 1
+; NPGSO-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP3]], align 1
; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; NPGSO-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756
; NPGSO-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; NPGSO: [[MIDDLE_BLOCK]]:
-; NPGSO-NEXT: br i1 true, label %[[FOR_COND_CLEANUP28:.*]], label %[[SCALAR_PH]]
+; NPGSO-NEXT: br label %[[FOR_COND_CLEANUP28:.*]]
; NPGSO: [[SCALAR_PH]]:
-; NPGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 756, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
+; NPGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
; NPGSO-NEXT: br label %[[FOR_BODY29:.*]]
; NPGSO: [[FOR_COND_CLEANUP28]]:
; NPGSO-NEXT: unreachable
@@ -628,8 +626,6 @@ define i32 @pr45526_pgso() !prof !14 {
; NPGSO-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; NPGSO: [[MIDDLE_BLOCK]]:
; NPGSO-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
-; NPGSO-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
-; NPGSO-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
; NPGSO: [[SCALAR_PH]]:
; NPGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 508, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; NPGSO-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 5, %[[ENTRY]] ]
@@ -639,9 +635,9 @@ define i32 @pr45526_pgso() !prof !14 {
; NPGSO-NEXT: [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[PIVPLUS1]], %[[LOOP]] ]
; NPGSO-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1
; NPGSO-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510
-; NPGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP24:![0-9]+]]
+; NPGSO-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP24:![0-9]+]]
; NPGSO: [[EXIT]]:
-; NPGSO-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; NPGSO-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ]
; NPGSO-NEXT: ret i32 [[FOR_LCSSA]]
;
entry:
@@ -842,13 +838,12 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <2 x i16> splat (i16 42), ptr [[TMP2]], align 4
+; CHECK-NEXT: store <2 x i16> splat (i16 42), ptr [[TMP1]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
@@ -859,7 +854,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 {
; CHECK-NEXT: store i16 42, ptr [[GEPOFB]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK: [[FOR_END]]:
; CHECK-NEXT: ret void
;
@@ -875,13 +870,12 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 {
; PGSO: [[VECTOR_BODY]]:
; PGSO-NEXT: [[TMP0:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; PGSO-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP0]]
-; PGSO-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; PGSO-NEXT: store <2 x i16> splat (i16 42), ptr [[TMP2]], align 4
+; PGSO-NEXT: store <2 x i16> splat (i16 42), ptr [[TMP1]], align 4
; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[TMP0]], 2
; PGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
; PGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; PGSO: [[MIDDLE_BLOCK]]:
-; PGSO-NEXT: br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; PGSO-NEXT: br label %[[SCALAR_PH]]
; PGSO: [[SCALAR_PH]]:
; PGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
; PGSO-NEXT: br label %[[FOR_BODY:.*]]
@@ -892,7 +886,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 {
; PGSO-NEXT: store i16 42, ptr [[GEPOFB]], align 4
; PGSO-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
; PGSO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025
-; PGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; PGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; PGSO: [[FOR_END]]:
; PGSO-NEXT: ret void
;
@@ -908,13 +902,12 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 {
; NPGSO: [[VECTOR_BODY]]:
; NPGSO-NEXT: [[TMP0:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; NPGSO-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP0]]
-; NPGSO-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; NPGSO-NEXT: store <2 x i16> splat (i16 42), ptr [[TMP2]], align 4
+; NPGSO-NEXT: store <2 x i16> splat (i16 42), ptr [[TMP1]], align 4
; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[TMP0]], 2
; NPGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
; NPGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
; NPGSO: [[MIDDLE_BLOCK]]:
-; NPGSO-NEXT: br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; NPGSO-NEXT: br label %[[SCALAR_PH]]
; NPGSO: [[SCALAR_PH]]:
; NPGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
; NPGSO-NEXT: br label %[[FOR_BODY:.*]]
@@ -925,7 +918,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 {
; NPGSO-NEXT: store i16 42, ptr [[GEPOFB]], align 4
; NPGSO-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
; NPGSO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025
-; NPGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; NPGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; NPGSO: [[FOR_END]]:
; NPGSO-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/phi-cost.ll b/llvm/test/Transforms/LoopVectorize/phi-cost.ll
index aee80c9..667c6a5 100644
--- a/llvm/test/Transforms/LoopVectorize/phi-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/phi-cost.ll
@@ -189,7 +189,7 @@ define i32 @red_phi_0(i32 %start, ptr %src) {
; CHECK-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP0]])
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll
index 4c04d96d..1bc98f9 100644
--- a/llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll
+++ b/llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll
@@ -22,14 +22,12 @@ define void @wide_ptr_induction_index_width_smaller_than_iv_width(ptr noalias %s
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 3
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP5]], align 1
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DST_0]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DST_0]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DST_0]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[DST_0]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; CHECK-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP11]], align 8
+; CHECK-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP7]], align 8
; CHECK-NEXT: store ptr [[TMP5]], ptr [[TMP7]], align 8
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 1
; CHECK-NEXT: store ptr [[TMP12]], ptr [[TMP8]], align 8
@@ -42,7 +40,7 @@ define void @wide_ptr_induction_index_width_smaller_than_iv_width(ptr noalias %s
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[SRC]], %[[ENTRY]] ]
@@ -58,7 +56,7 @@ define void @wide_ptr_induction_index_width_smaller_than_iv_width(ptr noalias %s
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i32 8
; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV]], 100
-; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction-unroll.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction-unroll.ll
index 97907b2..f36292b 100644
--- a/llvm/test/Transforms/LoopVectorize/pointer-induction-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/pointer-induction-unroll.ll
@@ -60,7 +60,7 @@ define void @non_constant_scalar_expansion(i32 %0, ptr %call) {
; STRIDED-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4294967264
; STRIDED-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; STRIDED: middle.block:
-; STRIDED-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; STRIDED-NEXT: br label [[SCALAR_PH]]
; STRIDED: scalar.ph:
; STRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ -2, [[MIDDLE_BLOCK]] ], [ 30, [[ENTRY:%.*]] ]
; STRIDED-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ null, [[ENTRY]] ]
@@ -73,7 +73,7 @@ define void @non_constant_scalar_expansion(i32 %0, ptr %call) {
; STRIDED-NEXT: store ptr [[P_0]], ptr [[ARRAYIDX]], align 4
; STRIDED-NEXT: [[INC]] = add i32 [[TMP24]], 1
; STRIDED-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP24]], 0
-; STRIDED-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
+; STRIDED-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_END:%.*]], label [[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
; STRIDED: for.end:
; STRIDED-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
index a4f2b07..d1ae9ce 100644
--- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
@@ -146,13 +146,11 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START_1]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, <4 x ptr> [[VECTOR_GEP]], i64 1
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr ptr, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: store <4 x ptr> [[TMP2]], ptr [[TMP3]], align 8
+; CHECK-NEXT: store <4 x ptr> [[TMP2]], ptr [[NEXT_GEP]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i8> [[WIDE_LOAD]], splat (i8 1)
-; CHECK-NEXT: store <4 x i8> [[TMP6]], ptr [[TMP5]], align 1
+; CHECK-NEXT: store <4 x i8> [[TMP6]], ptr [[TMP4]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -240,14 +238,13 @@ define void @non_constant_vector_expansion(i32 %0, ptr %call) {
; STRIDED-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> [[TMP4]]
; STRIDED-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
; STRIDED-NEXT: [[TMP6:%.*]] = getelementptr ptr, ptr [[CALL:%.*]], i32 [[OFFSET_IDX]]
-; STRIDED-NEXT: [[TMP7:%.*]] = getelementptr ptr, ptr [[TMP6]], i32 0
-; STRIDED-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP7]], align 4
+; STRIDED-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP6]], align 4
; STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; STRIDED-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP3]]
; STRIDED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; STRIDED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; STRIDED: middle.block:
-; STRIDED-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; STRIDED-NEXT: br label [[SCALAR_PH]]
; STRIDED: scalar.ph:
; STRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; STRIDED-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ null, [[ENTRY]] ]
@@ -260,7 +257,7 @@ define void @non_constant_vector_expansion(i32 %0, ptr %call) {
; STRIDED-NEXT: store ptr [[P_0]], ptr [[ARRAYIDX]], align 4
; STRIDED-NEXT: [[INC]] = add i32 [[TMP9]], 1
; STRIDED-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP9]], 100
-; STRIDED-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
+; STRIDED-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_END:%.*]], label [[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
; STRIDED: for.end:
; STRIDED-NEXT: ret void
;
@@ -311,11 +308,9 @@ define void @outside_lattice(ptr noalias %p, ptr noalias %q, i32 %n) {
; DEFAULT-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 4, i64 8, i64 12>
; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[P]], i32 [[OFFSET_IDX]]
-; DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i32 0
-; DEFAULT-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP6]], align 8
+; DEFAULT-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP5]], align 8
; DEFAULT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i32 [[OFFSET_IDX]]
-; DEFAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; DEFAULT-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP8]], align 4
+; DEFAULT-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP7]], align 4
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; DEFAULT-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 16
@@ -367,11 +362,9 @@ define void @outside_lattice(ptr noalias %p, ptr noalias %q, i32 %n) {
; STRIDED-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 4, i64 8, i64 12>
; STRIDED-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
; STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[P]], i32 [[OFFSET_IDX]]
-; STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i32 0
-; STRIDED-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP6]], align 8
+; STRIDED-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP5]], align 8
; STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i32 [[OFFSET_IDX]]
-; STRIDED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; STRIDED-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP8]], align 4
+; STRIDED-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP7]], align 4
; STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; STRIDED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; STRIDED-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 16
diff --git a/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll b/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
index b2da8c7..c491477 100644
--- a/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
@@ -81,8 +81,7 @@ define void @doit1(i32 %n, i32 %step) local_unnamed_addr {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 0
-; CHECK-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP21]], align 4
+; CHECK-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP20]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[DOTSPLAT3]]
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -203,8 +202,7 @@ define void @doit2(i32 %n, i32 %step) local_unnamed_addr {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
-; CHECK-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP20]], align 4
+; CHECK-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP19]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[DOTSPLAT3]]
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -397,8 +395,7 @@ define void @doit4(i32 %n, i8 signext %cstep) local_unnamed_addr {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [250 x i32], ptr @a, i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
-; CHECK-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP18]], align 4
+; CHECK-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP17]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[DOTSPLAT3]]
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/pr35773.ll b/llvm/test/Transforms/LoopVectorize/pr35773.ll
index 2f14655..b716515 100644
--- a/llvm/test/Transforms/LoopVectorize/pr35773.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr35773.ll
@@ -14,8 +14,7 @@ define void @doit1(ptr %ptr) {
; CHECK-NEXT: [[I8_IV_NEXT]] = add <4 x i8> [[I8_IV]], [[IV_FROM_TRUNC]]
; CHECK-NEXT: [[GEP1:%.+]] = getelementptr inbounds i32, ptr %ptr, i32 [[MAIN_IV]]
-; CHECK-NEXT: [[GEP2:%.+]] = getelementptr inbounds i32, ptr [[GEP1]], i32 0
-; CHECK-NEXT: store <4 x i32> [[I32_IV]], ptr [[GEP2]], align 4
+; CHECK-NEXT: store <4 x i32> [[I32_IV]], ptr [[GEP1]], align 4
; CHECK-NEXT: [[MAIN_IV_NEXT]] = add nuw i32 [[MAIN_IV]], 4
; CHECK-NEXT: [[I32_IV_NEXT]] = add <4 x i32> [[I32_IV]], splat (i32 36)
diff --git a/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll b/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
index fde93d7..83646e9 100644
--- a/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
@@ -58,14 +58,13 @@ define void @scev4stride1(ptr noalias nocapture %a, ptr noalias nocapture readon
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP15]], i32 2
; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP16]], i32 3
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP22]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END_LOOPEXIT:%.*]]
; CHECK: scalar.ph:
; CHECK: for.body:
; CHECK: for.end.loopexit:
diff --git a/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll b/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll
index e7b87b0..315ea12 100644
--- a/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll
@@ -44,9 +44,9 @@ define i16 @test_true_and_false_branch_equal() {
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 111, [[MIDDLE_BLOCK]] ], [ 99, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 99, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_07:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC7:%.*]], [[FOR_LATCH:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/pr45259.ll b/llvm/test/Transforms/LoopVectorize/pr45259.ll
index c7f2d7a..fade726 100644
--- a/llvm/test/Transforms/LoopVectorize/pr45259.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr45259.ll
@@ -48,8 +48,7 @@ define i8 @widget(ptr %arr, i8 %t9) {
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[ARR]], i8 [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = icmp slt <4 x i8> [[TMP11]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP15:%.*]] = zext <4 x i1> [[TMP14]] to <4 x i8>
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
-; CHECK-NEXT: store <4 x i8> [[TMP15]], ptr [[TMP16]], align 1
+; CHECK-NEXT: store <4 x i8> [[TMP15]], ptr [[TMP13]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
index b6aea9c..c044cc0 100644
--- a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
@@ -253,8 +253,8 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) {
; CHECK: pred.store.continue6:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
-; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
@@ -320,8 +320,8 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) {
; VF2UF2: pred.store.continue6:
; VF2UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VF2UF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2)
-; VF2UF2-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; VF2UF2-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF2UF2-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; VF2UF2-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; VF2UF2: middle.block:
; VF2UF2-NEXT: br label [[FOR_END:%.*]]
; VF2UF2: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll b/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll
index 6aaa443..7e1a449 100644
--- a/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll
@@ -47,14 +47,14 @@ define void @f() {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: store i32 0, ptr @f.e, align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
-; CHECK-NEXT: store i8 10, ptr [[TMP0]], align 1
+; CHECK-NEXT: store i8 10, ptr [[TMP0]], align 1, !alias.scope [[META3]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 500
; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 500, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/pr50686.ll b/llvm/test/Transforms/LoopVectorize/pr50686.ll
index fb7947e..14bcfde 100644
--- a/llvm/test/Transforms/LoopVectorize/pr50686.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr50686.ll
@@ -18,26 +18,25 @@ define void @m(ptr nocapture %p, ptr nocapture %p2, i32 %q) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[P2]], align 4, !alias.scope !0
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[P2]], align 4, !alias.scope [[META0:![0-9]+]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> zeroinitializer, [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX9_1]], align 4, !alias.scope !0
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX9_1]], align 4, !alias.scope [[META0]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP2]], [[BROADCAST_SPLAT3]]
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX9_2]], align 4, !alias.scope !0
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX9_2]], align 4, !alias.scope [[META0]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> [[TMP4]], [[BROADCAST_SPLAT5]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP8]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP7]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 60
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 60
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[FOR_END17:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 60, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label [[FOR_COND5:%.*]]
@@ -53,7 +52,7 @@ define void @m(ptr nocapture %p, ptr nocapture %p2, i32 %q) {
; CHECK-NEXT: store i32 [[SUB_2]], ptr [[ARRAYIDX14]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 63
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END17]], label [[FOR_COND5]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END17:%.*]], label [[FOR_COND5]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: for.end17:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
index bd96471..a28bdb8 100644
--- a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
@@ -34,10 +34,10 @@ define i32 @test(i32 %a, i1 %c.1, i1 %c.2 ) #0 {
; CHECK: middle.block:
; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PREDPHI7]])
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[PREDPHI5]], i32 1
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 182, [[MIDDLE_BLOCK]] ], [ 6, [[BB:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 35902, [[BB]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 6, [[BB:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 35902, [[BB]] ]
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
; CHECK: loop.header:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll
index af1c146..98da2b2 100644
--- a/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll
@@ -31,8 +31,7 @@ define void @test1_pr58811() {
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 196
; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[IND_ESCAPE:%.*]] = sub i32 [[IND_END]], [[INDUCTION_IV_LCSSA]]
-; CHECK-NEXT: br i1 false, label [[LOOP_3_PREHEADER:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 196, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_2_PREHEADER]] ]
; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_2_PREHEADER]] ]
@@ -43,9 +42,9 @@ define void @test1_pr58811() {
; CHECK-NEXT: [[IV_4_NEXT]] = sub i32 [[IV_4]], [[IV_1_LCSSA]]
; CHECK-NEXT: [[IV_3_NEXT]] = add i16 [[IV_3]], 1
; CHECK-NEXT: [[CMP88_1:%.*]] = icmp ult i16 [[IV_3]], 198
-; CHECK-NEXT: br i1 [[CMP88_1]], label [[LOOP_2]], label [[LOOP_3_PREHEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP88_1]], label [[LOOP_2]], label [[LOOP_3_PREHEADER:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: loop.3.preheader:
-; CHECK-NEXT: [[IV_4_LCSSA:%.*]] = phi i32 [ [[IV_4]], [[LOOP_2]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[IV_4_LCSSA:%.*]] = phi i32 [ [[IV_4]], [[LOOP_2]] ]
; CHECK-NEXT: br label [[LOOP_3:%.*]]
; CHECK: loop.3:
; CHECK-NEXT: [[IV_5:%.*]] = phi i32 [ [[SUB93_2]], [[LOOP_3]] ], [ 0, [[LOOP_3_PREHEADER]] ]
@@ -126,8 +125,7 @@ define void @test2_pr58811() {
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 196
; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[IND_ESCAPE:%.*]] = sub i32 [[IND_END]], [[INDUCTION_IV_LCSSA]]
-; CHECK-NEXT: br i1 false, label [[LOOP_4_PREHEADER:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 196, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_3_PREHEADER]] ]
; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_3_PREHEADER]] ]
@@ -138,9 +136,9 @@ define void @test2_pr58811() {
; CHECK-NEXT: [[SUB93_1]] = sub i32 [[IV_5]], [[IV_2_LCSSA]]
; CHECK-NEXT: [[INC_1]] = add i16 [[IV_4]], 1
; CHECK-NEXT: [[CMP88_1:%.*]] = icmp ult i16 [[IV_4]], 198
-; CHECK-NEXT: br i1 [[CMP88_1]], label [[LOOP_3]], label [[LOOP_4_PREHEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP88_1]], label [[LOOP_3]], label [[LOOP_4_PREHEADER:%.*]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: loop.4.preheader:
-; CHECK-NEXT: [[IV_5_LCSSA:%.*]] = phi i32 [ [[IV_5]], [[LOOP_3]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[IV_5_LCSSA:%.*]] = phi i32 [ [[IV_5]], [[LOOP_3]] ]
; CHECK-NEXT: br label [[LOOP_4]]
; CHECK: loop.4:
; CHECK-NEXT: [[IV_6:%.*]] = phi i32 [ [[SUB93_2]], [[LOOP_4]] ], [ 0, [[LOOP_4_PREHEADER]] ]
@@ -204,8 +202,7 @@ define void @test3_pr58811() {
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 196
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[IND_ESCAPE:%.*]] = sub i32 [[IND_END]], [[TMP3]]
-; CHECK-NEXT: br i1 false, label [[LOOP_4_PREHEADER:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 196, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_3_PREHEADER]] ]
; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_3_PREHEADER]] ]
@@ -216,9 +213,9 @@ define void @test3_pr58811() {
; CHECK-NEXT: [[SUB93_1]] = sub i32 [[IV_4]], [[ADD101_LCSSA]]
; CHECK-NEXT: [[INC_1]] = add i16 [[IV_3]], 1
; CHECK-NEXT: [[CMP88_1:%.*]] = icmp ult i16 [[IV_3]], 198
-; CHECK-NEXT: br i1 [[CMP88_1]], label [[LOOP_3]], label [[LOOP_4_PREHEADER]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP88_1]], label [[LOOP_3]], label [[LOOP_4_PREHEADER:%.*]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: loop.4.preheader:
-; CHECK-NEXT: [[IV_4_LCSSA:%.*]] = phi i32 [ [[IV_4]], [[LOOP_3]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[IV_4_LCSSA:%.*]] = phi i32 [ [[IV_4]], [[LOOP_3]] ]
; CHECK-NEXT: br label [[LOOP_4:%.*]]
; CHECK: loop.4:
; CHECK-NEXT: [[IV_5:%.*]] = phi i32 [ [[SUB93_2]], [[LOOP_4]] ], [ 0, [[LOOP_4_PREHEADER]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/pr66616.ll b/llvm/test/Transforms/LoopVectorize/pr66616.ll
index b5f9e99..a39fd47 100644
--- a/llvm/test/Transforms/LoopVectorize/pr66616.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr66616.ll
@@ -21,9 +21,9 @@ define void @pr66616(ptr %ptr) {
; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
-; CHECK-NEXT: br i1 true, label [[PREHEADER:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[PREHEADER:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP_1:%.*]]
; CHECK: loop.1:
; CHECK-NEXT: [[IV_1:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[LOOP_1]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/predicate-switch.ll
index c4a10e1..724aed8 100644
--- a/llvm/test/Transforms/LoopVectorize/predicate-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/predicate-switch.ll
@@ -22,8 +22,7 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
; IC1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1
; IC1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP1]]
; IC1-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP2]]
-; IC1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP3]], align 1
+; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[NEXT_GEP]], align 1
; IC1-NEXT: [[TMP7:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD]], splat (i8 -12)
; IC1-NEXT: [[TMP4:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD]], splat (i8 13)
; IC1-NEXT: [[TMP11:%.*]] = or <2 x i1> [[TMP7]], [[TMP4]]
@@ -120,9 +119,8 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
; IC2-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP2]]
; IC2-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP3]]
; IC2-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]]
-; IC2-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
; IC2-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 2
-; IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP5]], align 1
+; IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[NEXT_GEP]], align 1
; IC2-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x i8>, ptr [[TMP6]], align 1
; IC2-NEXT: [[TMP13:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD]], splat (i8 -12)
; IC2-NEXT: [[TMP14:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD6]], splat (i8 -12)
@@ -340,21 +338,21 @@ define void @switch_to_header(ptr %start) {
; IC1-NEXT: [[ENTRY:.*]]:
; IC1-NEXT: br label %[[LOOP_HEADER:.*]]
; IC1: [[LOOP_HEADER]]:
-; IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[IF_THEN:.*]] ]
+; IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[IF_THEN1:.*]] ]
; IC1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; IC1-NEXT: switch i64 [[IV]], label %[[LOOP_LATCH:.*]] [
-; IC1-NEXT: i64 120, label %[[IF_THEN]]
+; IC1-NEXT: i64 120, label %[[IF_THEN1]]
; IC1-NEXT: i64 100, label %[[LOOP_LATCH]]
; IC1-NEXT: ]
-; IC1: [[IF_THEN]]:
+; IC1: [[IF_THEN1]]:
; IC1-NEXT: br label %[[LOOP_HEADER]]
-; IC1: [[IF_THEN1:.*:]]
+; IC1: [[IF_THEN:.*:]]
; IC1-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[START]], i64 poison
; IC1-NEXT: store i64 42, ptr [[GEP]], align 1
; IC1-NEXT: unreachable
; IC1: [[LOOP_LATCH]]:
; IC1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 100
-; IC1-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[IF_THEN]]
+; IC1-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[IF_THEN1]]
; IC1: [[EXIT]]:
; IC1-NEXT: ret void
;
@@ -363,21 +361,21 @@ define void @switch_to_header(ptr %start) {
; IC2-NEXT: [[ENTRY:.*]]:
; IC2-NEXT: br label %[[LOOP_HEADER:.*]]
; IC2: [[LOOP_HEADER]]:
-; IC2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[IF_THEN:.*]] ]
+; IC2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[IF_THEN1:.*]] ]
; IC2-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; IC2-NEXT: switch i64 [[IV]], label %[[LOOP_LATCH:.*]] [
-; IC2-NEXT: i64 120, label %[[IF_THEN]]
+; IC2-NEXT: i64 120, label %[[IF_THEN1]]
; IC2-NEXT: i64 100, label %[[LOOP_LATCH]]
; IC2-NEXT: ]
-; IC2: [[IF_THEN]]:
+; IC2: [[IF_THEN1]]:
; IC2-NEXT: br label %[[LOOP_HEADER]]
-; IC2: [[IF_THEN1:.*:]]
+; IC2: [[IF_THEN:.*:]]
; IC2-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[START]], i64 poison
; IC2-NEXT: store i64 42, ptr [[GEP]], align 1
; IC2-NEXT: unreachable
; IC2: [[LOOP_LATCH]]:
; IC2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 100
-; IC2-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[IF_THEN]]
+; IC2-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[IF_THEN1]]
; IC2: [[EXIT]]:
; IC2-NEXT: ret void
;
@@ -415,15 +413,14 @@ define void @switch_all_to_default(ptr %start) {
; IC1: [[VECTOR_BODY]]:
; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[START]], i64 [[INDEX]]
-; IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; IC1-NEXT: store <2 x i64> splat (i64 42), ptr [[TMP2]], align 1
+; IC1-NEXT: store <2 x i64> splat (i64 42), ptr [[TMP1]], align 1
; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; IC1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; IC1-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; IC1: [[MIDDLE_BLOCK]]:
-; IC1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC1-NEXT: br label %[[EXIT:.*]]
; IC1: [[SCALAR_PH]]:
-; IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; IC1-NEXT: br label %[[LOOP_HEADER:.*]]
; IC1: [[LOOP_HEADER]]:
; IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -449,17 +446,16 @@ define void @switch_all_to_default(ptr %start) {
; IC2: [[VECTOR_BODY]]:
; IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[START]], i64 [[INDEX]]
-; IC2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
; IC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
-; IC2-NEXT: store <2 x i64> splat (i64 42), ptr [[TMP4]], align 1
+; IC2-NEXT: store <2 x i64> splat (i64 42), ptr [[TMP2]], align 1
; IC2-NEXT: store <2 x i64> splat (i64 42), ptr [[TMP5]], align 1
; IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; IC2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; IC2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; IC2: [[MIDDLE_BLOCK]]:
-; IC2-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC2-NEXT: br label %[[EXIT:.*]]
; IC2: [[SCALAR_PH]]:
-; IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; IC2-NEXT: br label %[[LOOP_HEADER:.*]]
; IC2: [[LOOP_HEADER]]:
; IC2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll b/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll
index 0a97510..cd44c3d 100644
--- a/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll
+++ b/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll
@@ -261,3 +261,55 @@ loop.latch: ; preds = %cond.false, %loop.h
exit: ; preds = %loop.latch
ret void
}
+
+; Test case for https://github.com/llvm/llvm-project/issues/149347.
+; FIXME: Currently miscompiles: the vector body unconditionally stores 0 to %dst, but the scalar loop always stores 1.
+define void @test_store_to_invariant_address_needs_mask_due_to_low_trip_count(ptr %dst) {
+; CHECK-LABEL: define void @test_store_to_invariant_address_needs_mask_due_to_low_trip_count(
+; CHECK-SAME: ptr [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: store i32 0, ptr [[DST]], align 4
+; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
+; CHECK: [[LOOP_HEADER]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT: br i1 true, label %[[LOOP_LATCH]], label %[[ELSE:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: br label %[[LOOP_LATCH]]
+; CHECK: [[LOOP_LATCH]]:
+; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ 1, %[[LOOP_HEADER]] ], [ 0, %[[ELSE]] ]
+; CHECK-NEXT: store i32 [[MERGE]], ptr [[DST]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 3
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i16 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ br i1 true, label %loop.latch, label %else
+
+else:
+ br label %loop.latch
+
+loop.latch:
+ %merge = phi i32 [ 1, %loop.header ], [ 0, %else ]
+ store i32 %merge, ptr %dst, align 4
+ %iv.next = add i16 %iv, 1
+ %ec = icmp eq i16 %iv.next, 3
+ br i1 %ec, label %exit, label %loop.header
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/preserve-or-disjoint.ll b/llvm/test/Transforms/LoopVectorize/preserve-or-disjoint.ll
index 3323844..b6156a8 100644
--- a/llvm/test/Transforms/LoopVectorize/preserve-or-disjoint.ll
+++ b/llvm/test/Transforms/LoopVectorize/preserve-or-disjoint.ll
@@ -15,12 +15,11 @@ define void @generate_disjoint_flags(i64 %n, ptr noalias %x) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = or disjoint <4 x i32> [[WIDE_LOAD]], splat (i32 1)
; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i32> [[WIDE_LOAD]], splat (i32 3)
; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw <4 x i32> [[TMP3]], [[TMP4]]
-; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP2]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP1]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-min-max.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-min-max.ll
index 3488f52..c1cc9e5 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-min-max.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-min-max.ll
@@ -19,7 +19,7 @@ define i32 @reduction_smin(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
@@ -66,7 +66,7 @@ define i32 @reduction_smin_select_ops_flipped(ptr nocapture %A, ptr nocapture %B
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
@@ -113,7 +113,7 @@ define i32 @reduction_smin_intrinsic(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP1]])
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
@@ -161,7 +161,7 @@ define i32 @reduction_umax(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
@@ -208,7 +208,7 @@ define i32 @reduction_umax_select_ops_flipped(ptr nocapture %A, ptr nocapture %B
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
@@ -255,7 +255,7 @@ define i32 @reduction_umax_intrinsic(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP1]])
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
index fbe3a7a..795605d 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
@@ -1370,7 +1370,7 @@ define float @reduction_conditional(ptr %A, ptr %B, ptr %C, float %S) {
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[PREDPHI3]])
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
index d9819ae..12d83eb 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
@@ -38,7 +38,7 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) {
; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP7]], [[TMP5]]
; CHECK-NEXT: [[BIN_RDX7:%.*]] = add i32 [[TMP9]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX8:%.*]] = add i32 [[TMP11]], [[BIN_RDX7]]
-; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
; CHECK: .lr.ph:
@@ -118,7 +118,7 @@ define i64 @reduction_sum_chain(ptr noalias %p, ptr noalias %q) {
; CHECK-NEXT: [[BIN_RDX:%.*]] = add i64 [[TMP19]], [[TMP17]]
; CHECK-NEXT: [[BIN_RDX11:%.*]] = add i64 [[TMP21]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX12:%.*]] = add i64 [[TMP23]], [[BIN_RDX11]]
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
index e762c9f..b302868 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
@@ -20,7 +20,7 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) {
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
; CHECK: .lr.ph:
@@ -73,7 +73,7 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
; CHECK: .lr.ph:
@@ -124,7 +124,7 @@ define i32 @reduction_sum_const(ptr noalias nocapture %A) {
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
; CHECK: .lr.ph:
@@ -178,7 +178,7 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
; CHECK: .lr.ph:
@@ -235,7 +235,7 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
; CHECK: .lr.ph:
@@ -289,7 +289,7 @@ define i32 @reduction_mul(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
; CHECK: .lr.ph:
@@ -340,7 +340,7 @@ define i32 @start_at_non_zero(ptr nocapture %in, ptr nocapture %coeff, ptr nocap
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
@@ -392,7 +392,7 @@ define i32 @reduction_and(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
@@ -443,7 +443,7 @@ define i32 @reduction_or(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
@@ -494,7 +494,7 @@ define i32 @reduction_xor(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
@@ -544,7 +544,7 @@ define float @reduction_fadd(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
@@ -596,7 +596,7 @@ define float @reduction_fmul(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
@@ -645,7 +645,7 @@ define i32 @reduction_sub_lhs(ptr noalias nocapture %A) {
; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
@@ -705,7 +705,7 @@ define float @reduction_conditional(ptr %A, ptr %B, ptr %C, float %S) {
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[PREDPHI3]])
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
@@ -836,7 +836,7 @@ define i32 @reduction_predicated(ptr noalias nocapture %A, ptr noalias nocapture
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
; CHECK: .lr.ph:
@@ -887,7 +887,7 @@ define i8 @reduction_add_trunc(ptr noalias nocapture %A) {
; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP4:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP2]])
-; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
; CHECK: .lr.ph:
@@ -936,7 +936,7 @@ define i8 @reduction_and_trunc(ptr noalias nocapture %A) {
; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP4:%.*]] = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> [[TMP2]])
-; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
; CHECK: .lr.ph:
@@ -1445,7 +1445,7 @@ define i32 @predicated_or_dominates_reduction(ptr %b) {
; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP49]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-odd-interleave-counts.ll b/llvm/test/Transforms/LoopVectorize/reduction-odd-interleave-counts.ll
index c4f51d9..daf4cba 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-odd-interleave-counts.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-odd-interleave-counts.ll
@@ -8,10 +8,9 @@ define i32 @reduction_sum(i64 %n, ptr noalias nocapture %A) {
; UF3-NEXT: [[SUM1:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[SUM1_NEXT:%.+]], %vector.body ]
; UF3-NEXT: [[SUM2:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[SUM2_NEXT:%.+]], %vector.body ]
; UF3-NEXT: [[GEP0:%.+]] = getelementptr inbounds i32, ptr %A, i64 [[IV]]
-; UF3-NEXT: [[L_GEP0:%.+]] = getelementptr inbounds i32, ptr [[GEP0]], i32 0
; UF3-NEXT: [[L_GEP1:%.+]] = getelementptr inbounds i32, ptr [[GEP0]], i32 4
; UF3-NEXT: [[L_GEP2:%.+]] = getelementptr inbounds i32, ptr [[GEP0]], i32 8
-; UF3-NEXT: [[L0:%.+]] = load <4 x i32>, ptr [[L_GEP0]], align 4
+; UF3-NEXT: [[L0:%.+]] = load <4 x i32>, ptr [[GEP0]], align 4
; UF3-NEXT: [[L1:%.+]] = load <4 x i32>, ptr [[L_GEP1]], align 4
; UF3-NEXT: [[L2:%.+]] = load <4 x i32>, ptr [[L_GEP2]], align 4
; UF3-NEXT: [[SUM0_NEXT]] = add <4 x i32> [[SUM0]], [[L0]]
@@ -35,12 +34,11 @@ define i32 @reduction_sum(i64 %n, ptr noalias nocapture %A) {
; UF5-NEXT: [[SUM3:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[SUM3_NEXT:%.+]], %vector.body ]
; UF5-NEXT: [[SUM4:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[SUM4_NEXT:%.+]], %vector.body ]
; UF5-NEXT: [[GEP0:%.+]] = getelementptr inbounds i32, ptr %A, i64 [[IV]]
-; UF5-NEXT: [[L_GEP0:%.+]] = getelementptr inbounds i32, ptr [[GEP0]], i32 0
; UF5-NEXT: [[L_GEP1:%.+]] = getelementptr inbounds i32, ptr [[GEP0]], i32 4
; UF5-NEXT: [[L_GEP2:%.+]] = getelementptr inbounds i32, ptr [[GEP0]], i32 8
; UF5-NEXT: [[L_GEP3:%.+]] = getelementptr inbounds i32, ptr [[GEP0]], i32 12
; UF5-NEXT: [[L_GEP4:%.+]] = getelementptr inbounds i32, ptr [[GEP0]], i32 16
-; UF5-NEXT: [[L0:%.+]] = load <4 x i32>, ptr [[L_GEP0]], align 4
+; UF5-NEXT: [[L0:%.+]] = load <4 x i32>, ptr [[GEP0]], align 4
; UF5-NEXT: [[L1:%.+]] = load <4 x i32>, ptr [[L_GEP1]], align 4
; UF5-NEXT: [[L2:%.+]] = load <4 x i32>, ptr [[L_GEP2]], align 4
; UF5-NEXT: [[L3:%.+]] = load <4 x i32>, ptr [[L_GEP3]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
index 7579fbc..b8c2405 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
@@ -15,8 +15,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY:%.*]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP4]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
@@ -24,7 +23,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; CHECK: middle.block:
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
; CHECK-NEXT: store i32 [[TMP6]], ptr [[GEP_DST:%.*]], align 4
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH:%.*]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
define void @reduc_store(ptr %dst, ptr readonly %src) {
entry:
%gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
@@ -253,7 +252,7 @@ for.end:
; CHECK: middle.block:
; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP34]])
; CHECK-NEXT: store i32 [[TMP36]], ptr [[GEP_DST:%.*]], align 4
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH:%.*]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
define void @reduc_store_inside_unrolled(ptr %dst, ptr readonly %src) {
entry:
%gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
@@ -522,7 +521,7 @@ define void @test_drop_poison_generating_dead_recipe(ptr %dst) {
; CHECK: middle.block:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP0]])
; CHECK-NEXT: store i64 [[TMP2]], ptr [[DST:%.*]], align 8
-; CHECK-NEXT: br i1 false, label %exit, label %scalar.ph
+; CHECK-NEXT: br label %scalar.ph
; CHECK: scalar.ph:
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/reduction.ll b/llvm/test/Transforms/LoopVectorize/reduction.ll
index 757be04..aa1ac25 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction.ll
@@ -776,7 +776,7 @@ define float @reduction_conditional(ptr %A, ptr %B, ptr %C, float %S) {
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[PREDPHI3]])
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
diff --git a/llvm/test/Transforms/LoopVectorize/remarks-reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/remarks-reduction-inloop.ll
index 581ccbf..be0e0d1 100644
--- a/llvm/test/Transforms/LoopVectorize/remarks-reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/remarks-reduction-inloop.ll
@@ -16,18 +16,17 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP4]] = add i32 [[TMP3]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 256, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[BODY:.*]]
; CHECK: [[BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
index ce4270d..4612545 100644
--- a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
+++ b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
@@ -104,27 +104,23 @@ define void @runtime_checks_ptr_inductions(ptr %dst.1, ptr %dst.2, i1 %c) {
; CHECK-LABEL: define void @runtime_checks_ptr_inductions(
; CHECK-SAME: ptr [[DST_1:%.*]], ptr [[DST_2:%.*]], i1 [[C:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[DST_11:%.*]] = ptrtoint ptr [[DST_1]] to i64
; CHECK-NEXT: br label %[[LOOP_1:.*]]
; CHECK: [[LOOP_1]]:
-; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], %[[LOOP_1]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[PTR_IV_1:%.*]] = phi ptr [ [[DST_1]], %[[ENTRY]] ], [ [[PTR_IV_1_NEXT:%.*]], %[[LOOP_1]] ]
; CHECK-NEXT: [[CALL:%.*]] = call i32 @val()
; CHECK-NEXT: [[SEL_DST:%.*]] = select i1 [[C]], ptr [[DST_1]], ptr [[DST_2]]
; CHECK-NEXT: [[PTR_IV_1_NEXT]] = getelementptr i8, ptr [[PTR_IV_1]], i64 1
; CHECK-NEXT: [[EC_1:%.*]] = icmp eq i32 [[CALL]], 0
-; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
; CHECK-NEXT: br i1 [[EC_1]], label %[[LOOP_2_HEADER_PREHEADER:.*]], label %[[LOOP_1]]
; CHECK: [[LOOP_2_HEADER_PREHEADER]]:
-; CHECK-NEXT: [[SEL_DST_LCSSA2:%.*]] = phi ptr [ [[SEL_DST]], %[[LOOP_1]] ]
-; CHECK-NEXT: [[INDVAR_LCSSA:%.*]] = phi i64 [ [[INDVAR]], %[[LOOP_1]] ]
+; CHECK-NEXT: [[SEL_DST_LCSSA1:%.*]] = phi ptr [ [[SEL_DST]], %[[LOOP_1]] ]
; CHECK-NEXT: [[PTR_IV_1_LCSSA:%.*]] = phi ptr [ [[PTR_IV_1]], %[[LOOP_1]] ]
; CHECK-NEXT: [[SEL_DST_LCSSA:%.*]] = phi ptr [ [[SEL_DST]], %[[LOOP_1]] ]
-; CHECK-NEXT: [[SEL_DST_LCSSA23:%.*]] = ptrtoint ptr [[SEL_DST_LCSSA2]] to i64
+; CHECK-NEXT: [[SEL_DST_LCSSA12:%.*]] = ptrtoint ptr [[SEL_DST_LCSSA1]] to i64
; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDVAR_LCSSA]], [[DST_11]]
-; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[SEL_DST_LCSSA23]]
+; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[PTR_IV_1_LCSSA]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[SEL_DST_LCSSA12]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP1]], 2
; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
@@ -135,10 +131,8 @@ define void @runtime_checks_ptr_inductions(ptr %dst.1, ptr %dst.2, i1 %c) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR_IV_1_LCSSA]], i64 [[INDEX]]
; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[SEL_DST_LCSSA]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP4]], align 1
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: store <2 x i8> [[WIDE_LOAD]], ptr [[TMP5]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[NEXT_GEP4]], align 1
+; CHECK-NEXT: store <2 x i8> [[WIDE_LOAD]], ptr [[NEXT_GEP]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1022
; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -146,13 +140,13 @@ define void @runtime_checks_ptr_inductions(ptr %dst.1, ptr %dst.2, i1 %c) {
; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1023, %[[MIDDLE_BLOCK]] ], [ 1, %[[LOOP_2_HEADER_PREHEADER]] ], [ 1, %[[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi ptr [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[PTR_IV_1_LCSSA]], %[[LOOP_2_HEADER_PREHEADER]] ], [ [[PTR_IV_1_LCSSA]], %[[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[SEL_DST_LCSSA]], %[[LOOP_2_HEADER_PREHEADER]] ], [ [[SEL_DST_LCSSA]], %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[PTR_IV_1_LCSSA]], %[[LOOP_2_HEADER_PREHEADER]] ], [ [[PTR_IV_1_LCSSA]], %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[SEL_DST_LCSSA]], %[[LOOP_2_HEADER_PREHEADER]] ], [ [[SEL_DST_LCSSA]], %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[LOOP_2_HEADER:.*]]
; CHECK: [[LOOP_2_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[DEC7:%.*]], %[[LOOP_2_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[PTR_IV_2:%.*]] = phi ptr [ [[PTR_IV_2_NEXT:%.*]], %[[LOOP_2_LATCH]] ], [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[PTR_IV_3:%.*]] = phi ptr [ [[PTR_IV_3_NEXT:%.*]], %[[LOOP_2_LATCH]] ], [ [[BC_RESUME_VAL6]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[PTR_IV_2:%.*]] = phi ptr [ [[PTR_IV_2_NEXT:%.*]], %[[LOOP_2_LATCH]] ], [ [[BC_RESUME_VAL4]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[PTR_IV_3:%.*]] = phi ptr [ [[PTR_IV_3_NEXT:%.*]], %[[LOOP_2_LATCH]] ], [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ]
; CHECK-NEXT: [[EC_2:%.*]] = icmp eq i32 [[IV]], 1024
; CHECK-NEXT: br i1 [[EC_2]], label %[[EXIT:.*]], label %[[LOOP_2_LATCH]]
; CHECK: [[LOOP_2_LATCH]]:
@@ -194,3 +188,287 @@ loop.2.latch:
exit:
ret void
}
+
+
+declare void @foo()
+declare void @bar()
+
+define void @expand_diff_scev_unknown(ptr %dst, i1 %invar.c, i32 %step) mustprogress {
+; CHECK-LABEL: define void @expand_diff_scev_unknown(
+; CHECK-SAME: ptr [[DST:%.*]], i1 [[INVAR_C:%.*]], i32 [[STEP:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP_1:.*]]
+; CHECK: [[LOOP_1]]:
+; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ [[INDVAR_NEXT:%.*]], %[[LOOP_1]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[IV_1:%.*]] = phi i32 [ [[STEP]], %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP_1]] ]
+; CHECK-NEXT: call void @foo()
+; CHECK-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1
+; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1
+; CHECK-NEXT: br i1 [[INVAR_C]], label %[[LOOP_2_PREHEADER:.*]], label %[[LOOP_1]]
+; CHECK: [[LOOP_2_PREHEADER]]:
+; CHECK-NEXT: [[INDVAR_LCSSA1:%.*]] = phi i32 [ [[INDVAR]], %[[LOOP_1]] ]
+; CHECK-NEXT: [[IV_1_LCSSA:%.*]] = phi i32 [ [[IV_1]], %[[LOOP_1]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IV_1_LCSSA]], [[STEP]]
+; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP1]], i32 0)
+; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[STEP]], -2
+; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[INDVAR_LCSSA1]], -1
+; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SMAX]], [[TMP4]]
+; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 1)
+; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[UMIN]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = sub i32 [[TMP5]], [[UMIN]]
+; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[STEP]], i32 1)
+; CHECK-NEXT: [[TMP8:%.*]] = udiv i32 [[TMP7]], [[UMAX]]
+; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP6]], [[TMP8]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP9]], 2
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK: [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[STEP]], 1
+; CHECK-NEXT: br i1 [[IDENT_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP9]], 2
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP9]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[IV_1_LCSSA]], [[N_VEC]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[IV_1_LCSSA]], [[INDEX]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[OFFSET_IDX]]
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP11]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP9]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[IV_1_LCSSA]], %[[LOOP_2_PREHEADER]] ], [ [[IV_1_LCSSA]], %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: br label %[[LOOP_2:.*]]
+; CHECK: [[LOOP_2]]:
+; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ [[IV_2_NEXT:%.*]], %[[LOOP_2]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[IV_2_NEXT]] = add nsw i32 [[IV_2]], [[STEP]]
+; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV_2]]
+; CHECK-NEXT: store i32 0, ptr [[GEP_DST]], align 4
+; CHECK-NEXT: [[EC_2:%.*]] = icmp slt i32 [[IV_2_NEXT]], 0
+; CHECK-NEXT: br i1 [[EC_2]], label %[[LOOP_2]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop.1
+
+loop.1:
+ %iv.1 = phi i32 [ %step, %entry ], [ %iv.1.next, %loop.1 ]
+ call void @foo()
+ %iv.1.next = add i32 %iv.1, 1
+ br i1 %invar.c, label %loop.2, label %loop.1
+
+loop.2:
+ %iv.2 = phi i32 [ %iv.1, %loop.1 ], [ %iv.2.next, %loop.2 ]
+ %iv.2.next = add nsw i32 %iv.2, %step
+ %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv.2
+ store i32 0, ptr %gep.dst
+ %ec.2 = icmp slt i32 %iv.2.next, 0
+ br i1 %ec.2, label %loop.2, label %exit
+
+exit:
+ ret void
+}
+
+define void @expand_diff_neg_ptrtoint_expr(ptr %src, ptr %start) {
+; CHECK-LABEL: define void @expand_diff_neg_ptrtoint_expr(
+; CHECK-SAME: ptr [[SRC:%.*]], ptr [[START:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
+; CHECK-NEXT: br label %[[LOOP_1:.*]]
+; CHECK: [[LOOP_1]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_1]] ]
+; CHECK-NEXT: [[PTR_IV_1:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_1_NEXT:%.*]], %[[LOOP_1]] ]
+; CHECK-NEXT: [[PTR_IV_1_NEXT]] = getelementptr i8, ptr [[PTR_IV_1]], i64 8
+; CHECK-NEXT: call void @foo()
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[EC_1:%.*]] = icmp eq i64 [[IV_NEXT]], 32
+; CHECK-NEXT: br i1 [[EC_1]], label %[[LOOP_2_PREHEADER:.*]], label %[[LOOP_1]]
+; CHECK: [[LOOP_2_PREHEADER]]:
+; CHECK-NEXT: [[PTR_IV_1_NEXT_LCSSA:%.*]] = phi ptr [ [[PTR_IV_1_NEXT]], %[[LOOP_1]] ]
+; CHECK-NEXT: br label %[[LOOP_2:.*]]
+; CHECK: [[LOOP_2]]:
+; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ [[IV_NEXT_1:%.*]], %[[LOOP_2]] ], [ 1, %[[LOOP_2_PREHEADER]] ]
+; CHECK-NEXT: [[PTR_IV_2:%.*]] = phi ptr [ [[PTR_IV_2_NEXT:%.*]], %[[LOOP_2]] ], [ [[PTR_IV_1_NEXT_LCSSA]], %[[LOOP_2_PREHEADER]] ]
+; CHECK-NEXT: call void @bar()
+; CHECK-NEXT: [[PTR_IV_2_NEXT]] = getelementptr i8, ptr [[PTR_IV_2]], i64 8
+; CHECK-NEXT: [[IV_NEXT_1]] = add i64 [[IV_1]], 1
+; CHECK-NEXT: [[EC_2:%.*]] = icmp eq i64 [[IV_NEXT_1]], 32
+; CHECK-NEXT: br i1 [[EC_2]], label %[[LOOP_3_PREHEADER:.*]], label %[[LOOP_2]]
+; CHECK: [[LOOP_3_PREHEADER]]:
+; CHECK-NEXT: [[TMP1:%.*]] = phi ptr [ [[PTR_IV_2_NEXT]], %[[LOOP_2]] ]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 0, [[SRC2]]
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP5]], [[TMP0]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], 16
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 -16
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
+; CHECK-NEXT: [[OFFSET_IDX5:%.*]] = mul i64 [[INDEX]], 8
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[TMP1]], i64 [[OFFSET_IDX5]]
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], -1
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr double, ptr [[SRC]], i64 [[TMP7]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8
+; CHECK-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[NEXT_GEP]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], -2
+; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, %[[MIDDLE_BLOCK]] ], [ 1, %[[LOOP_3_PREHEADER]] ], [ 1, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[TMP1]], %[[LOOP_3_PREHEADER]] ], [ [[TMP1]], %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label %[[LOOP_3:.*]]
+; CHECK: [[LOOP_3]]:
+; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[IV_NEXT_2:%.*]], %[[LOOP_3]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[PTR_IV_3:%.*]] = phi ptr [ [[PTR_IV_3_NEXT:%.*]], %[[LOOP_3]] ], [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[IV_2]], -1
+; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr double, ptr [[SRC]], i64 [[TMP12]]
+; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP_SRC]], align 8
+; CHECK-NEXT: [[PTR_IV_3_NEXT]] = getelementptr i8, ptr [[PTR_IV_3]], i64 8
+; CHECK-NEXT: store i64 [[L]], ptr [[PTR_IV_3]], align 8
+; CHECK-NEXT: [[IV_NEXT_2]] = add i64 [[IV_2]], 1
+; CHECK-NEXT: [[EC_3:%.*]] = icmp eq i64 [[IV_NEXT_2]], 0
+; CHECK-NEXT: br i1 [[EC_3]], label %[[EXIT:.*]], label %[[LOOP_3]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop.1
+
+loop.1:
+ %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop.1 ]
+ %ptr.iv.1 = phi ptr [ %start, %entry ], [ %ptr.iv.1.next, %loop.1 ]
+ %ptr.iv.1.next = getelementptr i8, ptr %ptr.iv.1, i64 8
+ call void @foo()
+ %iv.next = add i64 %iv, 1
+ %ec.1 = icmp eq i64 %iv.next, 32
+ br i1 %ec.1, label %loop.2, label %loop.1
+
+loop.2:
+ %iv.1 = phi i64 [ 1, %loop.1 ], [ %iv.next.1, %loop.2 ]
+ %ptr.iv.2 = phi ptr [ %ptr.iv.1.next, %loop.1 ], [ %ptr.iv.2.next, %loop.2 ]
+ call void @bar()
+ %ptr.iv.2.next = getelementptr i8, ptr %ptr.iv.2, i64 8
+ %iv.next.1 = add i64 %iv.1, 1
+ %ec.2 = icmp eq i64 %iv.next.1, 32
+ br i1 %ec.2, label %loop.3, label %loop.2
+
+loop.3:
+ %iv.2 = phi i64 [ 1, %loop.2 ], [ %iv.next.2, %loop.3 ]
+ %ptr.iv.3 = phi ptr [ %ptr.iv.2.next, %loop.2 ], [ %ptr.iv.3.next, %loop.3 ]
+ %6 = add i64 %iv.2, -1
+ %gep.src = getelementptr double, ptr %src, i64 %6
+ %l = load i64, ptr %gep.src, align 8
+ %ptr.iv.3.next = getelementptr i8, ptr %ptr.iv.3, i64 8
+ store i64 %l, ptr %ptr.iv.3, align 8
+ %iv.next.2 = add i64 %iv.2, 1
+ %ec.3 = icmp eq i64 %iv.next.2, 0
+ br i1 %ec.3, label %exit, label %loop.3
+
+exit:
+ ret void
+}
+
+
+declare i1 @cond()
+
+define void @scev_exp_reuse_const_add(ptr %dst, ptr %src) {
+; CHECK-LABEL: define void @scev_exp_reuse_const_add(
+; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
+; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
+; CHECK-NEXT: br label %[[LOOP_1:.*]]
+; CHECK: [[LOOP_1]]:
+; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], %[[LOOP_1]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[PTR_IV_1:%.*]] = phi ptr [ [[DST]], %[[ENTRY]] ], [ [[PTR_IV_1_NEXT:%.*]], %[[LOOP_1]] ]
+; CHECK-NEXT: [[PTR_IV_1_NEXT]] = getelementptr i8, ptr [[PTR_IV_1]], i64 2
+; CHECK-NEXT: [[C:%.*]] = call i1 @cond()
+; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
+; CHECK-NEXT: br i1 [[C]], label %[[LOOP_2_PH:.*]], label %[[LOOP_1]]
+; CHECK: [[LOOP_2_PH]]:
+; CHECK-NEXT: [[INDVAR_LCSSA:%.*]] = phi i64 [ [[INDVAR]], %[[LOOP_1]] ]
+; CHECK-NEXT: [[PTR_IV_1_NEXT_LCSSA:%.*]] = phi ptr [ [[PTR_IV_1_NEXT]], %[[LOOP_1]] ]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[INDVAR_LCSSA]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR_IV_1_NEXT_LCSSA]], i64 80
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR_IV_1_NEXT_LCSSA]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP4]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP5]], align 2
+; CHECK-NEXT: store <2 x i16> [[WIDE_LOAD]], ptr [[NEXT_GEP]], align 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 40
+; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 40, %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_2_PH]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[PTR_IV_1_NEXT_LCSSA]], %[[LOOP_2_PH]] ], [ [[PTR_IV_1_NEXT_LCSSA]], %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label %[[LOOP_2:.*]]
+; CHECK: [[LOOP_2]]:
+; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP_2]] ]
+; CHECK-NEXT: [[PTR_IV_2:%.*]] = phi ptr [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[PTR_IV_2_NEXT:%.*]], %[[LOOP_2]] ]
+; CHECK-NEXT: [[IV_2_NEXT]] = add i64 [[IV_1]], 1
+; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV_2_NEXT]]
+; CHECK-NEXT: [[L:%.*]] = load i16, ptr [[GEP_SRC_1]], align 2
+; CHECK-NEXT: [[PTR_IV_2_NEXT]] = getelementptr i8, ptr [[PTR_IV_2]], i64 2
+; CHECK-NEXT: store i16 [[L]], ptr [[PTR_IV_2]], align 2
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_1]], 40
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_2]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop.1
+
+loop.1:
+ %ptr.iv.1 = phi ptr [ %dst, %entry ], [ %ptr.iv.1.next, %loop.1 ]
+ %ptr.iv.1.next = getelementptr i8, ptr %ptr.iv.1, i64 2
+ %c = call i1 @cond()
+ br i1 %c, label %loop.2.ph, label %loop.1
+
+loop.2.ph:
+ br label %loop.2
+
+loop.2:
+ %iv.1 = phi i64 [ 0, %loop.2.ph ], [ %iv.2.next, %loop.2 ]
+ %ptr.iv.2 = phi ptr [ %ptr.iv.1.next, %loop.2.ph ], [ %ptr.iv.2.next, %loop.2 ]
+ %iv.2.next = add i64 %iv.1, 1
+ %gep.src.1 = getelementptr i16, ptr %src, i64 %iv.2.next
+ %l = load i16, ptr %gep.src.1, align 2
+ %ptr.iv.2.next = getelementptr i8, ptr %ptr.iv.2, i64 2
+ store i16 %l, ptr %ptr.iv.2, align 2
+ %ec = icmp eq i64 %iv.1, 40
+ br i1 %ec, label %exit, label %loop.2
+
+exit:
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare double @llvm.cos.f64(double) #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/Transforms/LoopVectorize/reverse_induction.ll b/llvm/test/Transforms/LoopVectorize/reverse_induction.ll
index 8c32ce20..cea16c9 100644
--- a/llvm/test/Transforms/LoopVectorize/reverse_induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/reverse_induction.ll
@@ -12,7 +12,6 @@ define i32 @reverse_induction_i64(i64 %startval, ptr %ptr) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[IND_END:%.*]] = sub i64 [[STARTVAL]], 1024
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -37,11 +36,11 @@ define i32 @reverse_induction_i64(i64 %startval, ptr %ptr) {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP11]], [[TMP10]]
; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-NEXT: br i1 true, label %[[LOOPEND:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[LOOPEND:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[STARTVAL]], %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[STARTVAL]], %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[ADD_I7:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[ADD_I:%.*]], %[[FOR_BODY]] ]
@@ -84,7 +83,6 @@ define i32 @reverse_induction_i128(i128 %startval, ptr %ptr) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[IND_END:%.*]] = sub i128 [[STARTVAL]], 1024
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i128 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -109,11 +107,11 @@ define i32 @reverse_induction_i128(i128 %startval, ptr %ptr) {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP11]], [[TMP10]]
; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-NEXT: br i1 true, label %[[LOOPEND:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[LOOPEND:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i128 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[STARTVAL]], %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i128 [ [[STARTVAL]], %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[ADD_I7:%.*]] = phi i128 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[ADD_I:%.*]], %[[FOR_BODY]] ]
@@ -165,7 +163,6 @@ define i32 @reverse_induction_i16(i16 %startval, ptr %ptr) {
; CHECK-NEXT: [[TMP3:%.*]] = or i1 [[TMP2]], [[MUL_OVERFLOW]]
; CHECK-NEXT: br i1 [[TMP3]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[IND_END:%.*]] = sub i16 [[STARTVAL]], 1024
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -191,11 +188,11 @@ define i32 @reverse_induction_i16(i16 %startval, ptr %ptr) {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP15]], [[TMP14]]
; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-NEXT: br i1 true, label %[[LOOPEND:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[LOOPEND:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[STARTVAL]], %[[ENTRY]] ], [ [[STARTVAL]], %[[VECTOR_SCEVCHECK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ [[STARTVAL]], %[[ENTRY]] ], [ [[STARTVAL]], %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[ADD_I7:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[ADD_I:%.*]], %[[FOR_BODY]] ]
@@ -278,10 +275,10 @@ define void @reverse_forward_induction_i64_i8() {
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[WHILE_END:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[WHILE_END:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i8 [ 0, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1023, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i8 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[WHILE_BODY:.*]]
; CHECK: [[WHILE_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ]
@@ -346,10 +343,10 @@ define void @reverse_forward_induction_i64_i8_signed() {
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[WHILE_END:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[WHILE_END:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, %[[MIDDLE_BLOCK]] ], [ 1023, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i8 [ -127, %[[MIDDLE_BLOCK]] ], [ -127, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1023, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i8 [ -127, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[WHILE_BODY:.*]]
; CHECK: [[WHILE_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-known-true.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-known-true.ll
new file mode 100644
index 0000000..830bd92
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-known-true.ll
@@ -0,0 +1,97 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -p loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
+
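+; %start.2.diff is constructed as %start.1 - 8, so the two pointer IVs have a
+; constant dependence distance; the memory runtime check is decided while it
+; is built and no vector.memcheck block is emitted, leaving only the SCEV
+; wrap check on the pointer difference (note the i3 trunc in the checks).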
+define void @test_runtime_check_known_false_after_construction(ptr %start.1, ptr %start.2, ptr %end) {
+; CHECK-LABEL: define void @test_runtime_check_known_false_after_construction(
+; CHECK-SAME: ptr [[START_1:%.*]], ptr [[START_2:%.*]], ptr [[END:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; CHECK-NEXT: [[GEP_START_2:%.*]] = getelementptr i8, ptr [[START_2]], i64 8
+; CHECK-NEXT: [[START_1_INT:%.*]] = ptrtoint ptr [[START_1]] to i64
+; CHECK-NEXT: [[START_2_INT:%.*]] = ptrtoint ptr [[GEP_START_2]] to i64
+; CHECK-NEXT: [[DIFF:%.*]] = sub i64 [[START_1_INT]], [[START_2_INT]]
+; CHECK-NEXT: [[START_2_DIFF:%.*]] = getelementptr i8, ptr [[START_2]], i64 [[DIFF]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 8
+; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START_1_INT]]
+; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2305843009213693951
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 3
+; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK: [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[END1]] to i3
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[START_1_INT]] to i3
+; CHECK-NEXT: [[TMP7:%.*]] = sub i3 [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = zext i3 [[TMP7]] to i64
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP8]], 0
+; CHECK-NEXT: br i1 [[IDENT_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[N_VEC]], -8
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[START_1]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[N_VEC]], -8
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[START_2_DIFF]], i64 [[TMP11]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], -8
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START_1]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[OFFSET_IDX2:%.*]] = mul i64 [[INDEX]], -8
+; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START_2_DIFF]], i64 [[OFFSET_IDX2]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[NEXT_GEP3]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 -3
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP14]], align 8
+; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i64, ptr [[TMP15]], i32 -3
+; CHECK-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i64> [[REVERSE]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: store <4 x i64> [[REVERSE4]], ptr [[TMP16]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[START_1]], %[[ENTRY]] ], [ [[START_1]], %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi ptr [ [[TMP12]], %[[MIDDLE_BLOCK]] ], [ [[START_2_DIFF]], %[[ENTRY]] ], [ [[START_2_DIFF]], %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[PTR_IV_1:%.*]] = phi ptr [ [[PTR_IV_1_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[PTR_IV_2:%.*]] = phi ptr [ [[PTR_IV_2_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[PTR_IV_2_NEXT]] = getelementptr i8, ptr [[PTR_IV_2]], i64 -8
+; CHECK-NEXT: [[PTR_IV_1_NEXT]] = getelementptr i8, ptr [[PTR_IV_1]], i64 -8
+; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV_2]], align 8
+; CHECK-NEXT: store i64 [[L]], ptr [[PTR_IV_1]], align 8
+; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_2]], [[END]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ %gep.start.2 = getelementptr i8, ptr %start.2, i64 8
+ %start.1.int = ptrtoint ptr %start.1 to i64
+ %start.2.int = ptrtoint ptr %gep.start.2 to i64
+ %diff = sub i64 %start.1.int, %start.2.int
+ %start.2.diff = getelementptr i8, ptr %start.2, i64 %diff
+ br label %loop
+
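+; Both pointer IVs step down by 8 bytes per iteration; the loop exits once
+; the load pointer %ptr.iv.2 reaches %end.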
+loop:
+ %ptr.iv.1 = phi ptr [ %ptr.iv.1.next, %loop ], [ %start.1, %entry ]
+ %ptr.iv.2 = phi ptr [ %ptr.iv.2.next, %loop ], [ %start.2.diff, %entry ]
+ %ptr.iv.2.next = getelementptr i8, ptr %ptr.iv.2, i64 -8
+ %ptr.iv.1.next = getelementptr i8, ptr %ptr.iv.1, i64 -8
+ %l = load i64, ptr %ptr.iv.2, align 8
+ store i64 %l, ptr %ptr.iv.1, align 8
+ %ec = icmp eq ptr %ptr.iv.2, %end
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
index db88eaa..44d2925 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
@@ -17,12 +17,10 @@ define void @test(ptr %A, i32 %x) {
; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0
-; CHECK-NEXT: store <4 x float> [[WIDE_LOAD]], ptr [[TMP10]], align 4
+; CHECK-NEXT: store <4 x float> [[WIDE_LOAD]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
index a47037c4..1035642 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
@@ -33,12 +33,10 @@ define void @load_clamped_index(ptr %A, ptr %B, i32 %N) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = urem i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[WIDE_LOAD]], splat (i32 10)
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP9]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -105,12 +103,10 @@ define void @store_clamped_index(ptr %A, ptr %B, i32 %N) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = urem i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[WIDE_LOAD]], splat (i32 10)
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP9]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -185,12 +181,10 @@ define void @load_clamped_index_offset_1(ptr %A, ptr %B, i32 %N) {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]]
; CHECK-NEXT: [[TMP11:%.*]] = urem i32 [[OFFSET_IDX]], 4
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4
; CHECK-NEXT: [[TMP14:%.*]] = add <4 x i32> [[WIDE_LOAD]], splat (i32 10)
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP16]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -369,10 +363,9 @@ define void @clamped_index_equal_dependence(ptr %A, ptr %B, i32 %N) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = urem i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[WIDE_LOAD]], splat (i32 10)
-; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP5]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP4]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check.ll b/llvm/test/Transforms/LoopVectorize/runtime-check.ll
index 22d9a53..c5838fe 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check.ll
@@ -429,7 +429,7 @@ define dso_local void @forced_optsize(ptr noalias nocapture readonly %x_p, ptr n
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference-simplifications.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference-simplifications.ll
index 6bf47c5..1d64499 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference-simplifications.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference-simplifications.ll
@@ -60,52 +60,40 @@ define void @test_large_number_of_group(ptr %dst, i64 %off, i64 %N) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = add nsw i64 [[INDEX]], -5
; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], [[OFF]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[DST:%.*]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP13]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[DST:%.*]], i64 [[TMP12]]
; CHECK-NEXT: store <4 x double> zeroinitializer, ptr [[TMP14]], align 8
; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP11]], [[OFF_MUL_2]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[TMP16]], i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP15]]
; CHECK-NEXT: store <4 x double> zeroinitializer, ptr [[TMP17]], align 8
; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP11]], [[OFF_MUL_3]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[TMP19]], i32 0
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP18]]
; CHECK-NEXT: store <4 x double> zeroinitializer, ptr [[TMP20]], align 8
; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP11]], [[OFF_MUL_4]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[TMP22]], i32 0
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP21]]
; CHECK-NEXT: store <4 x double> zeroinitializer, ptr [[TMP23]], align 8
; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP11]], [[OFF_MUL_5]]
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP25]], i32 0
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP24]]
; CHECK-NEXT: store <4 x double> zeroinitializer, ptr [[TMP26]], align 8
; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP11]], [[OFF_MUL_6]]
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP27]]
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[TMP28]], i32 0
+; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP27]]
; CHECK-NEXT: store <4 x double> zeroinitializer, ptr [[TMP29]], align 8
; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[TMP11]], [[OFF_MUL_7]]
-; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP30]]
-; CHECK-NEXT: [[TMP32:%.*]] = getelementptr double, ptr [[TMP31]], i32 0
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP30]]
; CHECK-NEXT: store <4 x double> zeroinitializer, ptr [[TMP32]], align 8
; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[TMP11]], [[OFF_MUL_8]]
-; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP33]]
-; CHECK-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[TMP34]], i32 0
+; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP33]]
; CHECK-NEXT: store <4 x double> zeroinitializer, ptr [[TMP35]], align 8
; CHECK-NEXT: [[TMP36:%.*]] = add i64 [[TMP11]], [[OFF_MUL_9]]
-; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP36]]
-; CHECK-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP37]], i32 0
+; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP36]]
; CHECK-NEXT: store <4 x double> zeroinitializer, ptr [[TMP38]], align 8
; CHECK-NEXT: [[TMP39:%.*]] = add i64 [[TMP11]], [[OFF_MUL_10]]
-; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP39]]
-; CHECK-NEXT: [[TMP41:%.*]] = getelementptr double, ptr [[TMP40]], i32 0
+; CHECK-NEXT: [[TMP41:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP39]]
; CHECK-NEXT: store <4 x double> zeroinitializer, ptr [[TMP41]], align 8
; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[TMP11]], [[OFF_MUL_11]]
-; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP42]]
-; CHECK-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[TMP43]], i32 0
+; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP42]]
; CHECK-NEXT: store <4 x double> zeroinitializer, ptr [[TMP44]], align 8
; CHECK-NEXT: [[TMP45:%.*]] = add i64 [[TMP11]], [[OFF_MUL_12]]
-; CHECK-NEXT: [[TMP46:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP45]]
-; CHECK-NEXT: [[TMP47:%.*]] = getelementptr double, ptr [[TMP46]], i32 0
+; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP45]]
; CHECK-NEXT: store <4 x double> zeroinitializer, ptr [[TMP47]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -242,20 +230,17 @@ define void @check_creation_order(ptr %a, ptr %b, i32 %m) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[INVARIANT_GEP]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP3]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x double>, ptr [[TMP6]], align 8
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x double>, ptr [[TMP5]], align 8
; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x double> [[WIDE_LOAD]], [[WIDE_LOAD4]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <4 x double> [[TMP7]], ptr [[TMP9]], align 8
+; CHECK-NEXT: store <4 x double> [[TMP7]], ptr [[TMP8]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 31996
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 31996, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
@@ -270,7 +255,7 @@ define void @check_creation_order(ptr %a, ptr %b, i32 %m) {
; CHECK-NEXT: store double [[ADD3]], ptr [[GEP_A]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 31999
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll
index cdf04cd..2590ccb 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll
@@ -69,15 +69,13 @@ define void @diff_checks(ptr nocapture noundef writeonly %dst, ptr nocapture nou
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i64 [[INDEX]], [[TMP10]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4, !alias.scope [[META0:![0-9]+]]
-; CHECK-NEXT: [[TMP16:%.*]] = add nsw i64 [[INDEX]], [[TMP11]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
-; CHECK-NEXT: store <4 x i32> [[WIDE_LOAD]], ptr [[TMP18]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4, !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT: [[TMP15:%.*]] = add nsw i64 [[INDEX]], [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP15]]
+; CHECK-NEXT: store <4 x i32> [[WIDE_LOAD]], ptr [[TMP16]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_EXIT]], label [[SCALAR_PH]]
@@ -86,12 +84,12 @@ define void @diff_checks(ptr nocapture noundef writeonly %dst, ptr nocapture nou
; CHECK-NEXT: br label [[INNER_LOOP:%.*]]
; CHECK: inner.loop:
; CHECK-NEXT: [[IV_INNER:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INNER_NEXT:%.*]], [[INNER_LOOP]] ]
-; CHECK-NEXT: [[TMP20:%.*]] = add nuw nsw i64 [[IV_INNER]], [[TMP10]]
-; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP20]]
-; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
-; CHECK-NEXT: [[TMP22:%.*]] = add nsw i64 [[IV_INNER]], [[TMP11]]
-; CHECK-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP22]]
-; CHECK-NEXT: store i32 [[TMP21]], ptr [[ARRAYIDX9_US]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = add nuw nsw i64 [[IV_INNER]], [[TMP10]]
+; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP18]]
+; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
+; CHECK-NEXT: [[TMP20:%.*]] = add nsw i64 [[IV_INNER]], [[TMP11]]
+; CHECK-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP20]]
+; CHECK-NEXT: store i32 [[TMP19]], ptr [[ARRAYIDX9_US]], align 4
; CHECK-NEXT: [[IV_INNER_NEXT]] = add nuw nsw i64 [[IV_INNER]], 1
; CHECK-NEXT: [[INNER_EXIT_COND:%.*]] = icmp eq i64 [[IV_INNER_NEXT]], [[WIDE_N]]
; CHECK-NEXT: br i1 [[INNER_EXIT_COND]], label [[INNER_EXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -189,16 +187,14 @@ define void @full_checks(ptr nocapture noundef %dst, ptr nocapture noundef reado
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[INDEX]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META9:![0-9]+]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4, !alias.scope [[META12:![0-9]+]], !noalias [[META9]]
-; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD]]
-; CHECK-NEXT: store <4 x i32> [[TMP9]], ptr [[TMP8]], align 4, !alias.scope [[META12]], !noalias [[META9]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META9:![0-9]+]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP4]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META12:![0-9]+]], !noalias [[META9]]
+; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD]]
+; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP6]], align 4, !alias.scope [[META12]], !noalias [[META9]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_EXIT]], label [[SCALAR_PH]]
@@ -207,12 +203,12 @@ define void @full_checks(ptr nocapture noundef %dst, ptr nocapture noundef reado
; CHECK-NEXT: br label [[INNER_LOOP:%.*]]
; CHECK: inner.loop:
; CHECK-NEXT: [[IV_INNER:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INNER_NEXT:%.*]], [[INNER_LOOP]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = add nuw nsw i64 [[IV_INNER]], [[TMP3]]
-; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP11]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
-; CHECK-NEXT: [[ARRAYIDX8_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX8_US]], align 4
-; CHECK-NEXT: [[ADD9_US:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
+; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[IV_INNER]], [[TMP3]]
+; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
+; CHECK-NEXT: [[ARRAYIDX8_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX8_US]], align 4
+; CHECK-NEXT: [[ADD9_US:%.*]] = add nsw i32 [[TMP11]], [[TMP10]]
; CHECK-NEXT: store i32 [[ADD9_US]], ptr [[ARRAYIDX8_US]], align 4
; CHECK-NEXT: [[IV_INNER_NEXT]] = add nuw nsw i64 [[IV_INNER]], 1
; CHECK-NEXT: [[INNER_EXIT_COND:%.*]] = icmp eq i64 [[IV_INNER_NEXT]], [[WIDE_N]]
@@ -319,17 +315,15 @@ define void @full_checks_diff_strides(ptr nocapture noundef %dst, ptr nocapture
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[INDEX]], [[TMP7]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4, !alias.scope [[META16:![0-9]+]]
-; CHECK-NEXT: [[TMP12:%.*]] = add nuw nsw i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4, !alias.scope [[META19:![0-9]+]], !noalias [[META16]]
-; CHECK-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD]]
-; CHECK-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP14]], align 4, !alias.scope [[META19]], !noalias [[META16]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4, !alias.scope [[META16:![0-9]+]]
+; CHECK-NEXT: [[TMP11:%.*]] = add nuw nsw i64 [[INDEX]], [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP11]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4, !alias.scope [[META19:![0-9]+]], !noalias [[META16]]
+; CHECK-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD]]
+; CHECK-NEXT: store <4 x i32> [[TMP13]], ptr [[TMP12]], align 4, !alias.scope [[META19]], !noalias [[META16]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_EXIT]], label [[SCALAR_PH]]
@@ -338,13 +332,13 @@ define void @full_checks_diff_strides(ptr nocapture noundef %dst, ptr nocapture
; CHECK-NEXT: br label [[INNER_LOOP:%.*]]
; CHECK: inner.loop:
; CHECK-NEXT: [[IV_INNER:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INNER_NEXT:%.*]], [[INNER_LOOP]] ]
-; CHECK-NEXT: [[TMP17:%.*]] = add nuw nsw i64 [[IV_INNER]], [[TMP7]]
-; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
-; CHECK-NEXT: [[TMP19:%.*]] = add nuw nsw i64 [[IV_INNER]], [[TMP8]]
-; CHECK-NEXT: [[ARRAYIDX8_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP19]]
-; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX8_US]], align 4
-; CHECK-NEXT: [[ADD9_US:%.*]] = add nsw i32 [[TMP20]], [[TMP18]]
+; CHECK-NEXT: [[TMP15:%.*]] = add nuw nsw i64 [[IV_INNER]], [[TMP7]]
+; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
+; CHECK-NEXT: [[TMP17:%.*]] = add nuw nsw i64 [[IV_INNER]], [[TMP8]]
+; CHECK-NEXT: [[ARRAYIDX8_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX8_US]], align 4
+; CHECK-NEXT: [[ADD9_US:%.*]] = add nsw i32 [[TMP18]], [[TMP16]]
; CHECK-NEXT: store i32 [[ADD9_US]], ptr [[ARRAYIDX8_US]], align 4
; CHECK-NEXT: [[IV_INNER_NEXT]] = add nuw nsw i64 [[IV_INNER]], 1
; CHECK-NEXT: [[INNER_EXIT_COND:%.*]] = icmp eq i64 [[IV_INNER_NEXT]], [[WIDE_N]]
@@ -435,15 +429,13 @@ define void @diff_checks_src_start_invariant(ptr nocapture noundef writeonly %ds
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; CHECK-NEXT: store <4 x i32> [[WIDE_LOAD]], ptr [[TMP10]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP7]]
+; CHECK-NEXT: store <4 x i32> [[WIDE_LOAD]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_LOOP_EXIT]], label [[SCALAR_PH]]
@@ -453,10 +445,10 @@ define void @diff_checks_src_start_invariant(ptr nocapture noundef writeonly %ds
; CHECK: inner.loop:
; CHECK-NEXT: [[IV_INNER:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INNER_NEXT:%.*]], [[INNER_LOOP]] ]
; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV_INNER]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
-; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i64 [[IV_INNER]], [[TMP5]]
-; CHECK-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP13]]
-; CHECK-NEXT: store i32 [[TMP12]], ptr [[ARRAYIDX6_US]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = add nuw nsw i64 [[IV_INNER]], [[TMP5]]
+; CHECK-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP11]]
+; CHECK-NEXT: store i32 [[TMP10]], ptr [[ARRAYIDX6_US]], align 4
; CHECK-NEXT: [[IV_INNER_NEXT]] = add nuw nsw i64 [[IV_INNER]], 1
; CHECK-NEXT: [[INNER_EXIT_COND:%.*]] = icmp eq i64 [[IV_INNER_NEXT]], [[WIDE_N]]
; CHECK-NEXT: br i1 [[INNER_EXIT_COND]], label [[INNER_LOOP_EXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP24:![0-9]+]]
@@ -546,17 +538,15 @@ define void @full_checks_src_start_invariant(ptr nocapture noundef %dst, ptr noc
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META25:![0-9]+]]
-; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[INDEX]], [[TMP4]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope [[META28:![0-9]+]], !noalias [[META25]]
-; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD]]
-; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr [[TMP9]], align 4, !alias.scope [[META28]], !noalias [[META25]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META25:![0-9]+]]
+; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[INDEX]], [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP6]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4, !alias.scope [[META28:![0-9]+]], !noalias [[META25]]
+; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD]]
+; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP7]], align 4, !alias.scope [[META28]], !noalias [[META25]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_LOOP_EXIT]], label [[SCALAR_PH]]
@@ -566,11 +556,11 @@ define void @full_checks_src_start_invariant(ptr nocapture noundef %dst, ptr noc
; CHECK: inner.loop:
; CHECK-NEXT: [[IV_INNER:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INNER_NEXT:%.*]], [[INNER_LOOP]] ]
; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV_INNER]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
-; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i64 [[IV_INNER]], [[TMP4]]
-; CHECK-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP13]]
-; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX6_US]], align 4
-; CHECK-NEXT: [[ADD7_US:%.*]] = add nsw i32 [[TMP14]], [[TMP12]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = add nuw nsw i64 [[IV_INNER]], [[TMP4]]
+; CHECK-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX6_US]], align 4
+; CHECK-NEXT: [[ADD7_US:%.*]] = add nsw i32 [[TMP12]], [[TMP10]]
; CHECK-NEXT: store i32 [[ADD7_US]], ptr [[ARRAYIDX6_US]], align 4
; CHECK-NEXT: [[IV_INNER_NEXT]] = add nuw nsw i64 [[IV_INNER]], 1
; CHECK-NEXT: [[INNER_EXIT_COND:%.*]] = icmp eq i64 [[IV_INNER_NEXT]], [[WIDE_N]]
@@ -690,17 +680,15 @@ define void @triple_nested_loop_mixed_access(ptr nocapture noundef %dst, ptr noc
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP19:%.*]] = add nuw nsw i64 [[INDEX]], [[TMP15]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP21]], align 4, !alias.scope [[META32:![0-9]+]]
-; CHECK-NEXT: [[TMP22:%.*]] = add nuw nsw i64 [[TMP18]], [[INDEX]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP24]], align 4, !alias.scope [[META35:![0-9]+]], !noalias [[META32]]
-; CHECK-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD4]], [[WIDE_LOAD]]
-; CHECK-NEXT: store <4 x i32> [[TMP25]], ptr [[TMP24]], align 4, !alias.scope [[META35]], !noalias [[META32]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP20]], align 4, !alias.scope [[META32:![0-9]+]]
+; CHECK-NEXT: [[TMP21:%.*]] = add nuw nsw i64 [[TMP18]], [[INDEX]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP21]]
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4, !alias.scope [[META35:![0-9]+]], !noalias [[META32]]
+; CHECK-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD4]], [[WIDE_LOAD]]
+; CHECK-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP22]], align 4, !alias.scope [[META35]], !noalias [[META32]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
+; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_LOOP_END]], label [[SCALAR_PH]]
@@ -709,13 +697,13 @@ define void @triple_nested_loop_mixed_access(ptr nocapture noundef %dst, ptr noc
; CHECK-NEXT: br label [[INNER_LOOP:%.*]]
; CHECK: inner.loop:
; CHECK-NEXT: [[INNER_IV:%.*]] = phi i64 [ [[INNER_IV_NEXT:%.*]], [[INNER_LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[INNER_IV]], [[TMP15]]
-; CHECK-NEXT: [[ARRAYIDX_US_US_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP27]]
-; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX_US_US_US]], align 4
-; CHECK-NEXT: [[TMP29:%.*]] = add nuw nsw i64 [[TMP18]], [[INNER_IV]]
-; CHECK-NEXT: [[ARRAYIDX17_US_US_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP29]]
-; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[ARRAYIDX17_US_US_US]], align 4
-; CHECK-NEXT: [[ADD18_US_US_US:%.*]] = add nsw i32 [[TMP30]], [[TMP28]]
+; CHECK-NEXT: [[TMP25:%.*]] = add nuw nsw i64 [[INNER_IV]], [[TMP15]]
+; CHECK-NEXT: [[ARRAYIDX_US_US_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP25]]
+; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX_US_US_US]], align 4
+; CHECK-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[TMP18]], [[INNER_IV]]
+; CHECK-NEXT: [[ARRAYIDX17_US_US_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP27]]
+; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX17_US_US_US]], align 4
+; CHECK-NEXT: [[ADD18_US_US_US:%.*]] = add nsw i32 [[TMP28]], [[TMP26]]
; CHECK-NEXT: store i32 [[ADD18_US_US_US]], ptr [[ARRAYIDX17_US_US_US]], align 4
; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i64 [[INNER_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INNER_IV_NEXT]], [[WIDE_TRIP_COUNT]]
@@ -847,17 +835,15 @@ define void @uncomputable_outer_tc(ptr nocapture noundef %dst, ptr nocapture nou
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP12:%.*]] = add nsw i64 [[INDEX]], [[TMP10]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4, !alias.scope [[META39:![0-9]+]]
-; CHECK-NEXT: [[TMP15:%.*]] = add nsw i64 [[INDEX]], [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP17]], align 4, !alias.scope [[META42:![0-9]+]], !noalias [[META39]]
-; CHECK-NEXT: [[TMP18:%.*]] = add nsw <4 x i32> [[WIDE_LOAD4]], [[WIDE_LOAD]]
-; CHECK-NEXT: store <4 x i32> [[TMP18]], ptr [[TMP17]], align 4, !alias.scope [[META42]], !noalias [[META39]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4, !alias.scope [[META39:![0-9]+]]
+; CHECK-NEXT: [[TMP14:%.*]] = add nsw i64 [[INDEX]], [[TMP11]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP14]]
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4, !alias.scope [[META42:![0-9]+]], !noalias [[META39]]
+; CHECK-NEXT: [[TMP16:%.*]] = add nsw <4 x i32> [[WIDE_LOAD4]], [[WIDE_LOAD]]
+; CHECK-NEXT: store <4 x i32> [[TMP16]], ptr [[TMP15]], align 4, !alias.scope [[META42]], !noalias [[META39]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_LOOP_EXIT]], label [[SCALAR_PH]]
@@ -866,13 +852,13 @@ define void @uncomputable_outer_tc(ptr nocapture noundef %dst, ptr nocapture nou
; CHECK-NEXT: br label [[INNER_LOOP:%.*]]
; CHECK: inner.loop:
; CHECK-NEXT: [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_LOOP]] ]
-; CHECK-NEXT: [[TMP20:%.*]] = add nsw i64 [[INNER_IV]], [[TMP10]]
-; CHECK-NEXT: [[ARRAYIDX5_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP20]]
-; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX5_US]], align 4
-; CHECK-NEXT: [[TMP22:%.*]] = add nsw i64 [[INNER_IV]], [[TMP11]]
-; CHECK-NEXT: [[ARRAYIDX10_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP22]]
-; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX10_US]], align 4
-; CHECK-NEXT: [[ADD11_US:%.*]] = add nsw i32 [[TMP23]], [[TMP21]]
+; CHECK-NEXT: [[TMP18:%.*]] = add nsw i64 [[INNER_IV]], [[TMP10]]
+; CHECK-NEXT: [[ARRAYIDX5_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP18]]
+; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX5_US]], align 4
+; CHECK-NEXT: [[TMP20:%.*]] = add nsw i64 [[INNER_IV]], [[TMP11]]
+; CHECK-NEXT: [[ARRAYIDX10_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP20]]
+; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX10_US]], align 4
+; CHECK-NEXT: [[ADD11_US:%.*]] = add nsw i32 [[TMP21]], [[TMP19]]
; CHECK-NEXT: store i32 [[ADD11_US]], ptr [[ARRAYIDX10_US]], align 4
; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i64 [[INNER_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INNER_IV_NEXT]], [[WIDE_TRIP_COUNT]]
@@ -880,8 +866,8 @@ define void @uncomputable_outer_tc(ptr nocapture noundef %dst, ptr nocapture nou
; CHECK: inner.loop.exit:
; CHECK-NEXT: [[OUTER_IV_NEXT]] = add i64 [[OUTER_IV]], 1
; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i8, ptr [[STR]], i64 [[OUTER_IV_NEXT]]
-; CHECK-NEXT: [[TMP24:%.*]] = load i8, ptr [[ARRAYIDX_US]], align 1
-; CHECK-NEXT: [[CMP_NOT_US:%.*]] = icmp eq i8 [[TMP24]], 0
+; CHECK-NEXT: [[TMP22:%.*]] = load i8, ptr [[ARRAYIDX_US]], align 1
+; CHECK-NEXT: [[CMP_NOT_US:%.*]] = icmp eq i8 [[TMP22]], 0
; CHECK-NEXT: br i1 [[CMP_NOT_US]], label [[WHILE_END_LOOPEXIT:%.*]], label [[OUTER_LOOP]]
; CHECK: while.end.loopexit:
; CHECK-NEXT: br label [[WHILE_END]]
@@ -1176,17 +1162,15 @@ define void @decreasing_outer_iv(ptr nocapture noundef %dst, ptr nocapture nound
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP19:%.*]] = add nsw i64 [[INDEX]], [[TMP15]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP21]], align 4, !alias.scope [[META53:![0-9]+]]
-; CHECK-NEXT: [[TMP22:%.*]] = add nsw i64 [[INDEX]], [[TMP16]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP24]], align 4, !alias.scope [[META56:![0-9]+]], !noalias [[META53]]
-; CHECK-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD5]], [[WIDE_LOAD]]
-; CHECK-NEXT: store <4 x i32> [[TMP25]], ptr [[TMP24]], align 4, !alias.scope [[META56]], !noalias [[META53]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP20]], align 4, !alias.scope [[META53:![0-9]+]]
+; CHECK-NEXT: [[TMP21:%.*]] = add nsw i64 [[INDEX]], [[TMP16]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP21]]
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4, !alias.scope [[META56:![0-9]+]], !noalias [[META53]]
+; CHECK-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD5]], [[WIDE_LOAD]]
+; CHECK-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP22]], align 4, !alias.scope [[META56]], !noalias [[META53]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP58:![0-9]+]]
+; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP58:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_LOOP_EXIT]], label [[SCALAR_PH]]
@@ -1195,13 +1179,13 @@ define void @decreasing_outer_iv(ptr nocapture noundef %dst, ptr nocapture nound
; CHECK-NEXT: br label [[INNER_LOOP:%.*]]
; CHECK: inner.loop:
; CHECK-NEXT: [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_LOOP]] ]
-; CHECK-NEXT: [[TMP27:%.*]] = add nsw i64 [[INNER_IV]], [[TMP15]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP27]]
-; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP29:%.*]] = add nsw i64 [[INNER_IV]], [[TMP16]]
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP29]]
-; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
-; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP30]], [[TMP28]]
+; CHECK-NEXT: [[TMP25:%.*]] = add nsw i64 [[INNER_IV]], [[TMP15]]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP25]]
+; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP27:%.*]] = add nsw i64 [[INNER_IV]], [[TMP16]]
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP27]]
+; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP28]], [[TMP26]]
; CHECK-NEXT: store i32 [[ADD9]], ptr [[ARRAYIDX8]], align 4
; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i64 [[INNER_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INNER_IV_NEXT]], [[WIDE_TRIP_COUNT]]
@@ -1330,17 +1314,15 @@ define void @unknown_inner_stride(ptr nocapture noundef %dst, ptr nocapture noun
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP14:%.*]] = add nsw i64 [[INDEX]], [[TMP11]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4, !alias.scope [[META60:![0-9]+]]
-; CHECK-NEXT: [[TMP17:%.*]] = add nsw i64 [[INDEX]], [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4, !alias.scope [[META63:![0-9]+]], !noalias [[META60]]
-; CHECK-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD]]
-; CHECK-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP19]], align 4, !alias.scope [[META63]], !noalias [[META60]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4, !alias.scope [[META60:![0-9]+]]
+; CHECK-NEXT: [[TMP16:%.*]] = add nsw i64 [[INDEX]], [[TMP12]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP16]]
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP17]], align 4, !alias.scope [[META63:![0-9]+]], !noalias [[META60]]
+; CHECK-NEXT: [[TMP18:%.*]] = add nsw <4 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD]]
+; CHECK-NEXT: store <4 x i32> [[TMP18]], ptr [[TMP17]], align 4, !alias.scope [[META63]], !noalias [[META60]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP65:![0-9]+]]
+; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP65:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_LOOP_EXIT]], label [[SCALAR_PH]]
@@ -1349,15 +1331,15 @@ define void @unknown_inner_stride(ptr nocapture noundef %dst, ptr nocapture noun
; CHECK-NEXT: br label [[INNER_LOOP:%.*]]
; CHECK: inner.loop:
; CHECK-NEXT: [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_LOOP]] ]
-; CHECK-NEXT: [[TMP22:%.*]] = mul nsw i64 [[INNER_IV]], [[TMP0]]
-; CHECK-NEXT: [[TMP23:%.*]] = add nsw i64 [[TMP22]], [[TMP11]]
-; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP23]]
-; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
-; CHECK-NEXT: [[TMP25:%.*]] = mul nsw i64 [[INNER_IV]], [[TMP1]]
-; CHECK-NEXT: [[TMP26:%.*]] = add nsw i64 [[TMP25]], [[TMP12]]
-; CHECK-NEXT: [[ARRAYIDX11_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP26]]
-; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX11_US]], align 4
-; CHECK-NEXT: [[ADD12_US:%.*]] = add nsw i32 [[TMP27]], [[TMP24]]
+; CHECK-NEXT: [[TMP20:%.*]] = mul nsw i64 [[INNER_IV]], [[TMP0]]
+; CHECK-NEXT: [[TMP21:%.*]] = add nsw i64 [[TMP20]], [[TMP11]]
+; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
+; CHECK-NEXT: [[TMP23:%.*]] = mul nsw i64 [[INNER_IV]], [[TMP1]]
+; CHECK-NEXT: [[TMP24:%.*]] = add nsw i64 [[TMP23]], [[TMP12]]
+; CHECK-NEXT: [[ARRAYIDX11_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP24]]
+; CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX11_US]], align 4
+; CHECK-NEXT: [[ADD12_US:%.*]] = add nsw i32 [[TMP25]], [[TMP22]]
; CHECK-NEXT: store i32 [[ADD12_US]], ptr [[ARRAYIDX11_US]], align 4
; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i64 [[INNER_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INNER_IV_NEXT]], [[WIDE_TRIP_COUNT]]
@@ -1446,15 +1428,13 @@ define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(ptr noca
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[INDEX]], [[MUL]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], splat (i32 10)
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], splat (i32 10)
+; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP3]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP67:![0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP67:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_EXIT]], label [[SCALAR_PH]]
@@ -1538,15 +1518,14 @@ define void @stride_check_known_via_loop_guard(ptr %C, ptr %A, i32 %Acols) {
; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX_US]], align 8, !alias.scope [[META69:![0-9]+]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[TMP1]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 0
-; CHECK-NEXT: store <4 x double> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8, !alias.scope [[META72:![0-9]+]], !noalias [[META69]]
+; CHECK-NEXT: store <4 x double> [[BROADCAST_SPLAT]], ptr [[TMP0]], align 8, !alias.scope [[META72:![0-9]+]], !noalias [[META69]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP74:![0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP74:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[OUTER_LATCH]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[OUTER_LATCH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[OUTER_HEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label [[INNER:%.*]]
; CHECK: inner:
; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
index 23a3fc1..bfc0a48 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
@@ -48,13 +48,11 @@ define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) {
; CHECK-VF4UF1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4UF1-NEXT: [[TMP17:%.*]] = add nuw nsw i64 [[INDEX]], 1
; CHECK-VF4UF1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP17]]
-; CHECK-VF4UF1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
-; CHECK-VF4UF1-NEXT: [[WIDE_LOAD]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
+; CHECK-VF4UF1-NEXT: [[WIDE_LOAD]] = load <vscale x 4 x i32>, ptr [[TMP18]], align 4
; CHECK-VF4UF1-NEXT: [[TMP20:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1)
; CHECK-VF4UF1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
; CHECK-VF4UF1-NEXT: [[TMP22:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], [[TMP20]]
-; CHECK-VF4UF1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0
-; CHECK-VF4UF1-NEXT: store <vscale x 4 x i32> [[TMP22]], ptr [[TMP23]], align 4
+; CHECK-VF4UF1-NEXT: store <vscale x 4 x i32> [[TMP22]], ptr [[TMP21]], align 4
; CHECK-VF4UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
; CHECK-VF4UF1-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4UF1-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -115,22 +113,20 @@ define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) {
; CHECK-VF4UF2-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD3:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4UF2-NEXT: [[TMP17:%.*]] = add nuw nsw i64 [[INDEX]], 1
; CHECK-VF4UF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP17]]
-; CHECK-VF4UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
; CHECK-VF4UF2-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF4UF2-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 4
; CHECK-VF4UF2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP21]]
-; CHECK-VF4UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
+; CHECK-VF4UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP18]], align 4
; CHECK-VF4UF2-NEXT: [[WIDE_LOAD3]] = load <vscale x 4 x i32>, ptr [[TMP22]], align 4
; CHECK-VF4UF2-NEXT: [[TMP23:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1)
; CHECK-VF4UF2-NEXT: [[TMP24:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[WIDE_LOAD3]], i32 -1)
; CHECK-VF4UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
; CHECK-VF4UF2-NEXT: [[TMP26:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], [[TMP23]]
; CHECK-VF4UF2-NEXT: [[TMP27:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD3]], [[TMP24]]
-; CHECK-VF4UF2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 0
; CHECK-VF4UF2-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF4UF2-NEXT: [[TMP30:%.*]] = mul nuw i64 [[TMP29]], 4
; CHECK-VF4UF2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i64 [[TMP30]]
-; CHECK-VF4UF2-NEXT: store <vscale x 4 x i32> [[TMP26]], ptr [[TMP28]], align 4
+; CHECK-VF4UF2-NEXT: store <vscale x 4 x i32> [[TMP26]], ptr [[TMP25]], align 4
; CHECK-VF4UF2-NEXT: store <vscale x 4 x i32> [[TMP27]], ptr [[TMP31]], align 4
; CHECK-VF4UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
; CHECK-VF4UF2-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -214,8 +210,7 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
; CHECK-VF4UF1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4UF1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ undef, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4UF1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4UF1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
-; CHECK-VF4UF1-NEXT: [[WIDE_LOAD]] = load <vscale x 4 x i32>, ptr [[TMP11]], align 4
+; CHECK-VF4UF1-NEXT: [[WIDE_LOAD]] = load <vscale x 4 x i32>, ptr [[TMP10]], align 4
; CHECK-VF4UF1-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1)
; CHECK-VF4UF1-NEXT: [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[WIDE_LOAD]], [[TMP12]]
; CHECK-VF4UF1-NEXT: [[TMP14:%.*]] = icmp sgt <vscale x 4 x i32> [[TMP13]], zeroinitializer
@@ -277,11 +272,10 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
; CHECK-VF4UF2-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ undef, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4UF2-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ undef, %[[VECTOR_PH]] ], [ [[TMP26:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4UF2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
; CHECK-VF4UF2-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF4UF2-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 4
; CHECK-VF4UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP13]]
-; CHECK-VF4UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP11]], align 4
+; CHECK-VF4UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP10]], align 4
; CHECK-VF4UF2-NEXT: [[WIDE_LOAD2]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
; CHECK-VF4UF2-NEXT: [[TMP15:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1)
; CHECK-VF4UF2-NEXT: [[TMP16:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[WIDE_LOAD2]], i32 -1)
@@ -409,16 +403,14 @@ define void @recurrence_3(ptr nocapture readonly %a, ptr nocapture %b, i32 %n, f
; CHECK-VF4UF1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i16> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4UF1-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
; CHECK-VF4UF1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[OFFSET_IDX]]
-; CHECK-VF4UF1-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[TMP19]], i32 0
-; CHECK-VF4UF1-NEXT: [[WIDE_LOAD]] = load <vscale x 4 x i16>, ptr [[TMP20]], align 2, !alias.scope [[META6:![0-9]+]]
+; CHECK-VF4UF1-NEXT: [[WIDE_LOAD]] = load <vscale x 4 x i16>, ptr [[TMP19]], align 2, !alias.scope [[META6:![0-9]+]]
; CHECK-VF4UF1-NEXT: [[TMP21:%.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_LOAD]], i32 -1)
; CHECK-VF4UF1-NEXT: [[TMP22:%.*]] = sitofp <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x double>
; CHECK-VF4UF1-NEXT: [[TMP23:%.*]] = sitofp <vscale x 4 x i16> [[TMP21]] to <vscale x 4 x double>
; CHECK-VF4UF1-NEXT: [[TMP24:%.*]] = fmul fast <vscale x 4 x double> [[TMP23]], [[BROADCAST_SPLAT]]
; CHECK-VF4UF1-NEXT: [[TMP25:%.*]] = fsub fast <vscale x 4 x double> [[TMP22]], [[TMP24]]
; CHECK-VF4UF1-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[OFFSET_IDX]]
-; CHECK-VF4UF1-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, ptr [[TMP26]], i32 0
-; CHECK-VF4UF1-NEXT: store <vscale x 4 x double> [[TMP25]], ptr [[TMP27]], align 8, !alias.scope [[META9:![0-9]+]], !noalias [[META6]]
+; CHECK-VF4UF1-NEXT: store <vscale x 4 x double> [[TMP25]], ptr [[TMP26]], align 8, !alias.scope [[META9:![0-9]+]], !noalias [[META6]]
; CHECK-VF4UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
; CHECK-VF4UF1-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4UF1-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
@@ -489,11 +481,10 @@ define void @recurrence_3(ptr nocapture readonly %a, ptr nocapture %b, i32 %n, f
; CHECK-VF4UF2-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i16> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD4:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4UF2-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
; CHECK-VF4UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[OFFSET_IDX]]
-; CHECK-VF4UF2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[TMP19]], i32 0
; CHECK-VF4UF2-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF4UF2-NEXT: [[TMP22:%.*]] = mul nuw i64 [[TMP21]], 4
; CHECK-VF4UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[TMP19]], i64 [[TMP22]]
-; CHECK-VF4UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP20]], align 2, !alias.scope [[META6:![0-9]+]]
+; CHECK-VF4UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP19]], align 2, !alias.scope [[META6:![0-9]+]]
; CHECK-VF4UF2-NEXT: [[WIDE_LOAD4]] = load <vscale x 4 x i16>, ptr [[TMP23]], align 2, !alias.scope [[META6]]
; CHECK-VF4UF2-NEXT: [[TMP24:%.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_LOAD]], i32 -1)
; CHECK-VF4UF2-NEXT: [[TMP25:%.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> [[WIDE_LOAD]], <vscale x 4 x i16> [[WIDE_LOAD4]], i32 -1)
@@ -506,11 +497,10 @@ define void @recurrence_3(ptr nocapture readonly %a, ptr nocapture %b, i32 %n, f
; CHECK-VF4UF2-NEXT: [[TMP32:%.*]] = fsub fast <vscale x 4 x double> [[TMP26]], [[TMP30]]
; CHECK-VF4UF2-NEXT: [[TMP33:%.*]] = fsub fast <vscale x 4 x double> [[TMP27]], [[TMP31]]
; CHECK-VF4UF2-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[OFFSET_IDX]]
-; CHECK-VF4UF2-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, ptr [[TMP34]], i32 0
; CHECK-VF4UF2-NEXT: [[TMP36:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF4UF2-NEXT: [[TMP37:%.*]] = mul nuw i64 [[TMP36]], 4
; CHECK-VF4UF2-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, ptr [[TMP34]], i64 [[TMP37]]
-; CHECK-VF4UF2-NEXT: store <vscale x 4 x double> [[TMP32]], ptr [[TMP35]], align 8, !alias.scope [[META9:![0-9]+]], !noalias [[META6]]
+; CHECK-VF4UF2-NEXT: store <vscale x 4 x double> [[TMP32]], ptr [[TMP34]], align 8, !alias.scope [[META9:![0-9]+]], !noalias [[META6]]
; CHECK-VF4UF2-NEXT: store <vscale x 4 x double> [[TMP33]], ptr [[TMP38]], align 8, !alias.scope [[META9]], !noalias [[META6]]
; CHECK-VF4UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
; CHECK-VF4UF2-NEXT: [[TMP39:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -800,15 +790,13 @@ define void @sink_after(ptr %a, ptr %b, i64 %n) {
; CHECK-VF4UF1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i16> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4UF1-NEXT: [[TMP12:%.*]] = add nuw nsw i64 [[INDEX]], 1
; CHECK-VF4UF1-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP12]]
-; CHECK-VF4UF1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP13]], i32 0
-; CHECK-VF4UF1-NEXT: [[WIDE_LOAD]] = load <vscale x 4 x i16>, ptr [[TMP14]], align 2, !alias.scope [[META17:![0-9]+]]
+; CHECK-VF4UF1-NEXT: [[WIDE_LOAD]] = load <vscale x 4 x i16>, ptr [[TMP13]], align 2, !alias.scope [[META17:![0-9]+]]
; CHECK-VF4UF1-NEXT: [[TMP15:%.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_LOAD]], i32 -1)
; CHECK-VF4UF1-NEXT: [[TMP16:%.*]] = sext <vscale x 4 x i16> [[TMP15]] to <vscale x 4 x i32>
; CHECK-VF4UF1-NEXT: [[TMP17:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i32>
; CHECK-VF4UF1-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP17]], [[TMP16]]
; CHECK-VF4UF1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
-; CHECK-VF4UF1-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
-; CHECK-VF4UF1-NEXT: store <vscale x 4 x i32> [[TMP18]], ptr [[TMP20]], align 4, !alias.scope [[META20:![0-9]+]], !noalias [[META17]]
+; CHECK-VF4UF1-NEXT: store <vscale x 4 x i32> [[TMP18]], ptr [[TMP19]], align 4, !alias.scope [[META20:![0-9]+]], !noalias [[META17]]
; CHECK-VF4UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
; CHECK-VF4UF1-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-VF4UF1-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
@@ -861,11 +849,10 @@ define void @sink_after(ptr %a, ptr %b, i64 %n) {
; CHECK-VF4UF2-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i16> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD3:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4UF2-NEXT: [[TMP12:%.*]] = add nuw nsw i64 [[INDEX]], 1
; CHECK-VF4UF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP12]]
-; CHECK-VF4UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP13]], i32 0
; CHECK-VF4UF2-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF4UF2-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4
; CHECK-VF4UF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[TMP13]], i64 [[TMP16]]
-; CHECK-VF4UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP14]], align 2, !alias.scope [[META17:![0-9]+]]
+; CHECK-VF4UF2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP13]], align 2, !alias.scope [[META17:![0-9]+]]
; CHECK-VF4UF2-NEXT: [[WIDE_LOAD3]] = load <vscale x 4 x i16>, ptr [[TMP17]], align 2, !alias.scope [[META17]]
; CHECK-VF4UF2-NEXT: [[TMP18:%.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_LOAD]], i32 -1)
; CHECK-VF4UF2-NEXT: [[TMP19:%.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> [[WIDE_LOAD]], <vscale x 4 x i16> [[WIDE_LOAD3]], i32 -1)
@@ -876,11 +863,10 @@ define void @sink_after(ptr %a, ptr %b, i64 %n) {
; CHECK-VF4UF2-NEXT: [[TMP24:%.*]] = mul nsw <vscale x 4 x i32> [[TMP22]], [[TMP20]]
; CHECK-VF4UF2-NEXT: [[TMP25:%.*]] = mul nsw <vscale x 4 x i32> [[TMP23]], [[TMP21]]
; CHECK-VF4UF2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
-; CHECK-VF4UF2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 0
; CHECK-VF4UF2-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF4UF2-NEXT: [[TMP29:%.*]] = mul nuw i64 [[TMP28]], 4
; CHECK-VF4UF2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i64 [[TMP29]]
-; CHECK-VF4UF2-NEXT: store <vscale x 4 x i32> [[TMP24]], ptr [[TMP27]], align 4, !alias.scope [[META20:![0-9]+]], !noalias [[META17]]
+; CHECK-VF4UF2-NEXT: store <vscale x 4 x i32> [[TMP24]], ptr [[TMP26]], align 4, !alias.scope [[META20:![0-9]+]], !noalias [[META17]]
; CHECK-VF4UF2-NEXT: store <vscale x 4 x i32> [[TMP25]], ptr [[TMP30]], align 4, !alias.scope [[META20]], !noalias [[META17]]
; CHECK-VF4UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
; CHECK-VF4UF2-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll b/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll
index b0029a4..1ec2993 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll
@@ -32,11 +32,10 @@ define i32 @iv_live_out_wide(ptr %dst) {
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <vscale x 2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 2
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i64 [[TMP13]]
-; CHECK-NEXT: store <vscale x 2 x i16> zeroinitializer, ptr [[TMP11]], align 2
+; CHECK-NEXT: store <vscale x 2 x i16> zeroinitializer, ptr [[TMP10]], align 2
; CHECK-NEXT: store <vscale x 2 x i16> zeroinitializer, ptr [[TMP14]], align 2
; CHECK-NEXT: [[TMP15:%.*]] = add <vscale x 2 x i32> [[BROADCAST_SPLAT]], [[STEP_ADD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP6]]
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-lifetime.ll b/llvm/test/Transforms/LoopVectorize/scalable-lifetime.ll
index 4a1d7a2..7aac9d1 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-lifetime.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-lifetime.ll
@@ -27,12 +27,11 @@ define void @test(ptr %d) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4096, ptr [[ARR]])
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[D]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <vscale x 2 x i32> splat (i32 100), ptr [[TMP7]], align 8
+; CHECK-NEXT: store <vscale x 2 x i32> splat (i32 100), ptr [[TMP6]], align 8
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4096, ptr [[ARR]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 128, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -43,7 +42,7 @@ define void @test(ptr %d) {
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4096, ptr [[ARR]])
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[D]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: store i32 100, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4096, ptr [[ARR]])
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
@@ -98,12 +97,11 @@ define void @testloopvariant(ptr %d) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4096, ptr [[ARR]])
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[D]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <vscale x 2 x i32> splat (i32 100), ptr [[TMP7]], align 8
+; CHECK-NEXT: store <vscale x 2 x i32> splat (i32 100), ptr [[TMP6]], align 8
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4096, ptr [[ARR]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 128, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -112,10 +110,10 @@ define void @testloopvariant(ptr %d) {
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr [1024 x i32], ptr [[ARR]], i32 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr [1024 x i32], ptr [[ARR]], i32 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4096, ptr [[ARR]])
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[D]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: store i32 100, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4096, ptr [[ARR]])
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll
index e901d98..ba337aa 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll
@@ -25,11 +25,10 @@ define i8 @reduction_add_trunc(ptr noalias nocapture %A) {
; CHECK-NEXT: [[TMP14:%.*]] = and <vscale x 8 x i32> [[VEC_PHI]], splat (i32 255)
; CHECK-NEXT: [[TMP15:%.*]] = and <vscale x 8 x i32> [[VEC_PHI1]], splat (i32 255)
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 8
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[TMP11]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 4
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 4
; CHECK-NEXT: [[TMP26:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-NEXT: [[TMP27:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
diff --git a/llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll b/llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll
index 16b0593..7811b17 100644
--- a/llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll
@@ -24,9 +24,8 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
; NO-IC: %[[T4:.+]] = add nuw nsw i64 [[OFFSET_IDX]], %tmp0
; NO-IC: %[[T6:.+]] = sub nsw i64 %[[T4]], %x
; NO-IC: %[[T8:.+]] = getelementptr inbounds i32, ptr %a, i64 %[[T6]]
-; NO-IC: %[[T10:.+]] = getelementptr inbounds i32, ptr %[[T8]], i32 0
; NO-IC: %[[T12:.+]] = getelementptr inbounds i32, ptr %[[T8]], i32 4
-; NO-IC: load <4 x i32>, ptr %[[T10]], align 4
+; NO-IC: load <4 x i32>, ptr %[[T8]], align 4
; NO-IC: load <4 x i32>, ptr %[[T12]], align 4
; NO-IC: br {{.*}}, label %middle.block, label %vector.body
;
diff --git a/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll b/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll
index a808c8b..70772dc 100644
--- a/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll
+++ b/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll
@@ -19,9 +19,9 @@ define void @test_pr63368(i1 %c, ptr %A) {
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT_1:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT_1:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP_1_HEADER:%.*]]
; CHECK: loop.1.header:
; CHECK-NEXT: [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP_1_LATCH:%.*]] ]
@@ -57,11 +57,10 @@ define void @test_pr63368(i1 %c, ptr %A) {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX5]] to i8
; CHECK-NEXT: [[TMP10:%.*]] = add i8 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[A]], i8 [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: store <4 x i8> zeroinitializer, ptr [[TMP12]], align 1
+; CHECK-NEXT: store <4 x i8> zeroinitializer, ptr [[TMP11]], align 1
; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i32 [[INDEX5]], 4
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT6]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK7:%.*]], label [[VECTOR_BODY4]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT6]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK7:%.*]], label [[VECTOR_BODY4]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block7:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_2:%.*]], label [[SCALAR_PH2]]
diff --git a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll
index a5cf45c..b2acc64 100644
--- a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll
+++ b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll
@@ -46,9 +46,9 @@ define void @step_direction_unknown(i32 %arg, ptr %dst) {
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[LOOP]] ]
@@ -104,14 +104,13 @@ define void @integer_induction_wraps_scev_predicate_known(i32 %x, ptr %call, ptr
; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[INDEX]] to i32
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 30, [[DOTCAST]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr ptr, ptr [[CALL]], i32 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr ptr, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP6]], align 4
+; CHECK-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 992
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 992
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1022, [[MIDDLE_BLOCK]] ], [ 30, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
@@ -124,7 +123,7 @@ define void @integer_induction_wraps_scev_predicate_known(i32 %x, ptr %call, ptr
; CHECK-NEXT: store ptr [[P_0]], ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[INC]] = add i32 [[IV]], 1
; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV]], 1024
-; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_END:%.*]], label [[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -190,14 +189,12 @@ define void @implied_wrap_predicate(ptr %A, ptr %B, ptr %C) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i64, ptr [[A]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[TMP17]], i32 0
+; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr [[TMP17]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[C]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr [[TMP18]], align 4
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[C]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i64, ptr [[TMP19]], i32 0
-; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr [[TMP20]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -272,11 +269,10 @@ define void @no_signed_wrap_iv_via_btc(ptr %dst, i32 %N) mustprogress {
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SUB4]], [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP6]], align 4
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[OUTER_LOOPEXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll b/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll
index a4b2f0c..64e12cc 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll
@@ -31,8 +31,7 @@ define i32 @multi_user_cmp(ptr readonly %a, i64 noundef %n) {
; CHECK-VF4-IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-VF4-IC1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-VF4-IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4-IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-VF4-IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-VF4-IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-VF4-IC1-NEXT: [[TMP3:%.*]] = fcmp olt <4 x float> [[WIDE_LOAD]], zeroinitializer
; CHECK-VF4-IC1-NEXT: [[TMP4]] = or <4 x i1> [[VEC_PHI1]], [[TMP3]]
; CHECK-VF4-IC1-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true)
@@ -89,9 +88,8 @@ define i32 @multi_user_cmp(ptr readonly %a, i64 noundef %n) {
; CHECK-VF4-IC2-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
; CHECK-VF4-IC2-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-VF4-IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4-IC2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
; CHECK-VF4-IC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 4
-; CHECK-VF4-IC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-VF4-IC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; CHECK-VF4-IC2-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
; CHECK-VF4-IC2-NEXT: [[TMP6:%.*]] = fcmp olt <4 x float> [[WIDE_LOAD]], zeroinitializer
; CHECK-VF4-IC2-NEXT: [[TMP7:%.*]] = fcmp olt <4 x float> [[WIDE_LOAD4]], zeroinitializer
@@ -252,8 +250,7 @@ define i32 @multi_user_cmp_int(ptr readonly %a, i64 noundef %n) {
; CHECK-VF4-IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-VF4-IC1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-VF4-IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4-IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-VF4-IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-VF4-IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-VF4-IC1-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-VF4-IC1-NEXT: [[TMP4]] = or <4 x i1> [[VEC_PHI1]], [[TMP3]]
; CHECK-VF4-IC1-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true)
@@ -310,9 +307,8 @@ define i32 @multi_user_cmp_int(ptr readonly %a, i64 noundef %n) {
; CHECK-VF4-IC2-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
; CHECK-VF4-IC2-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-VF4-IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4-IC2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
; CHECK-VF4-IC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 4
-; CHECK-VF4-IC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-VF4-IC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
; CHECK-VF4-IC2-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
; CHECK-VF4-IC2-NEXT: [[TMP6:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], zeroinitializer
; CHECK-VF4-IC2-NEXT: [[TMP7:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD4]], zeroinitializer
@@ -484,8 +480,7 @@ define i32 @multi_user_cmp_branch_use(ptr readonly %a, ptr %b, i64 noundef %n) {
; CHECK-VF4-IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[PRED_STORE_CONTINUE8]] ]
; CHECK-VF4-IC1-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[PRED_STORE_CONTINUE8]] ]
; CHECK-VF4-IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4-IC1-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
-; CHECK-VF4-IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP3]], align 4, !alias.scope [[META6:![0-9]+]]
+; CHECK-VF4-IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4, !alias.scope [[META6:![0-9]+]]
; CHECK-VF4-IC1-NEXT: [[TMP4:%.*]] = fcmp olt <4 x float> [[WIDE_LOAD]], zeroinitializer
; CHECK-VF4-IC1-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI2]], [[TMP4]]
; CHECK-VF4-IC1-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true)
@@ -598,9 +593,8 @@ define i32 @multi_user_cmp_branch_use(ptr readonly %a, ptr %b, i64 noundef %n) {
; CHECK-VF4-IC2-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[PRED_STORE_CONTINUE19]] ]
; CHECK-VF4-IC2-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[PRED_STORE_CONTINUE19]] ]
; CHECK-VF4-IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4-IC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0
; CHECK-VF4-IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 4
-; CHECK-VF4-IC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP5]], align 4, !alias.scope [[META6:![0-9]+]]
+; CHECK-VF4-IC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP3]], align 4, !alias.scope [[META6:![0-9]+]]
; CHECK-VF4-IC2-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !alias.scope [[META6]]
; CHECK-VF4-IC2-NEXT: [[TMP7:%.*]] = fcmp olt <4 x float> [[WIDE_LOAD]], zeroinitializer
; CHECK-VF4-IC2-NEXT: [[TMP8:%.*]] = fcmp olt <4 x float> [[WIDE_LOAD5]], zeroinitializer
@@ -893,8 +887,7 @@ define i32 @multi_user_cmp_branch_use_and_outside_bb_use(ptr readonly %a, i64 no
; CHECK-VF4-IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-VF4-IC1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-VF4-IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4-IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-VF4-IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-VF4-IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-VF4-IC1-NEXT: [[TMP3:%.*]] = fcmp olt <4 x float> [[WIDE_LOAD]], zeroinitializer
; CHECK-VF4-IC1-NEXT: [[TMP4]] = or <4 x i1> [[VEC_PHI1]], [[TMP3]]
; CHECK-VF4-IC1-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true)
@@ -954,9 +947,8 @@ define i32 @multi_user_cmp_branch_use_and_outside_bb_use(ptr readonly %a, i64 no
; CHECK-VF4-IC2-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
; CHECK-VF4-IC2-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-VF4-IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; CHECK-VF4-IC2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
; CHECK-VF4-IC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 4
-; CHECK-VF4-IC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-VF4-IC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; CHECK-VF4-IC2-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
; CHECK-VF4-IC2-NEXT: [[TMP6:%.*]] = fcmp olt <4 x float> [[WIDE_LOAD]], zeroinitializer
; CHECK-VF4-IC2-NEXT: [[TMP7:%.*]] = fcmp olt <4 x float> [[WIDE_LOAD4]], zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll b/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll
index c17985d..8ab7ea8 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll
@@ -16,8 +16,7 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
; CHECK-VF2IC1-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ]
; CHECK-VF2IC1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PREDPHI:%.*]], %[[PRED_LOAD_CONTINUE2]] ]
; CHECK-VF2IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[TMP0]]
-; CHECK-VF2IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-VF2IC1-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-VF2IC1-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
; CHECK-VF2IC1-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], splat (i32 35)
; CHECK-VF2IC1-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
; CHECK-VF2IC1-NEXT: br i1 [[TMP5]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
index c5d3181..5e48b1f 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
@@ -17,8 +17,7 @@ define i32 @select_const_i32_from_icmp(ptr %v, i64 %n) {
; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[INDEX]]
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD]], splat (i32 3)
; CHECK-VF4IC1-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -64,11 +63,10 @@ define i32 @select_const_i32_from_icmp(ptr %v, i64 %n) {
; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[INDEX]]
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 4
; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 8
; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 12
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
@@ -209,8 +207,7 @@ define i32 @select_const_i32_from_icmp2(ptr %v, i64 %n) {
; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[INDEX]]
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 3)
; CHECK-VF4IC1-NEXT: [[TMP4]] = or <4 x i1> [[VEC_PHI]], [[TMP3]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -256,11 +253,10 @@ define i32 @select_const_i32_from_icmp2(ptr %v, i64 %n) {
; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[INDEX]]
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 4
; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 8
; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 12
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
@@ -397,8 +393,7 @@ define i32 @select_i32_from_icmp(ptr %v, i32 %a, i32 %b, i64 %n) {
; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[INDEX]]
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD]], splat (i32 3)
; CHECK-VF4IC1-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -444,11 +439,10 @@ define i32 @select_i32_from_icmp(ptr %v, i32 %a, i32 %b, i64 %n) {
; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[INDEX]]
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 4
; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 8
; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 12
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
@@ -589,8 +583,7 @@ define i32 @select_const_i32_from_fcmp_fast(ptr %v, i64 %n) {
; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[INDEX]]
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = fcmp fast one <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00)
; CHECK-VF4IC1-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -636,11 +629,10 @@ define i32 @select_const_i32_from_fcmp_fast(ptr %v, i64 %n) {
; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[INDEX]]
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 4
; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 8
; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 12
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
@@ -781,8 +773,7 @@ define i32 @select_const_i32_from_fcmp(ptr %v, i64 %n) {
; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[INDEX]]
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = fcmp one <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00)
; CHECK-VF4IC1-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -828,11 +819,10 @@ define i32 @select_const_i32_from_fcmp(ptr %v, i64 %n) {
; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[INDEX]]
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 4
; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 8
; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 12
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/select-neg-cond.ll b/llvm/test/Transforms/LoopVectorize/select-neg-cond.ll
index def239e..285c674 100644
--- a/llvm/test/Transforms/LoopVectorize/select-neg-cond.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-neg-cond.ll
@@ -11,18 +11,17 @@ define void @neg_cond(ptr noalias %p, ptr noalias %q) {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[P]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD]], splat (i32 42)
; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> splat (i32 42), <4 x i32> splat (i32 43)
-; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP1]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP0]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll
index d51f64f..37d75ff 100644
--- a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll
@@ -11,24 +11,23 @@ define i64 @pr62565_incoming_value_known_undef(i64 %a, ptr %src) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[WIDE_LOAD]], splat (i32 1)
-; CHECK-NEXT: [[TMP3]] = or <2 x i1> [[VEC_PHI]], [[TMP2]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[WIDE_LOAD]], splat (i32 1)
+; CHECK-NEXT: [[TMP2]] = or <2 x i1> [[VEC_PHI]], [[TMP1]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
+; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP3]])
-; CHECK-NEXT: [[TMP6:%.*]] = freeze i1 [[TMP5]]
-; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i64 [[A]], i64 undef
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP2]])
+; CHECK-NEXT: [[TMP5:%.*]] = freeze i1 [[TMP4]]
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP5]], i64 [[A]], i64 undef
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 33, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ undef, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ undef, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ]
@@ -72,24 +71,23 @@ define i64 @pr62565_incoming_value_known_poison(i64 %a, ptr %src) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[WIDE_LOAD]], splat (i32 1)
-; CHECK-NEXT: [[TMP3]] = or <2 x i1> [[VEC_PHI]], [[TMP2]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[WIDE_LOAD]], splat (i32 1)
+; CHECK-NEXT: [[TMP2]] = or <2 x i1> [[VEC_PHI]], [[TMP1]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
+; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP3]])
-; CHECK-NEXT: [[TMP6:%.*]] = freeze i1 [[TMP5]]
-; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i64 [[A]], i64 poison
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP2]])
+; CHECK-NEXT: [[TMP5:%.*]] = freeze i1 [[TMP4]]
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP5]], i64 [[A]], i64 poison
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 33, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ poison, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ]
@@ -133,24 +131,23 @@ define i64 @pr62565_incoming_value_may_be_poison(i64 %a, ptr %src, i64 %start) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[WIDE_LOAD]], splat (i32 1)
-; CHECK-NEXT: [[TMP3]] = or <2 x i1> [[VEC_PHI]], [[TMP2]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[WIDE_LOAD]], splat (i32 1)
+; CHECK-NEXT: [[TMP2]] = or <2 x i1> [[VEC_PHI]], [[TMP1]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
+; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP3]])
-; CHECK-NEXT: [[TMP6:%.*]] = freeze i1 [[TMP5]]
-; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i64 [[A]], i64 [[START]]
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP2]])
+; CHECK-NEXT: [[TMP5:%.*]] = freeze i1 [[TMP4]]
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP5]], i64 [[A]], i64 [[START]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 33, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[START]], [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/select-with-fastflags.ll b/llvm/test/Transforms/LoopVectorize/select-with-fastflags.ll
index 56cfc31..e571f69 100644
--- a/llvm/test/Transforms/LoopVectorize/select-with-fastflags.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-with-fastflags.ll
@@ -15,17 +15,14 @@ define void @select_with_fastmath_flags(ptr noalias %a, ptr noalias %b, ptr noal
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[C]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fcmp fast ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], splat (float 1.000000e+01)
; CHECK-NEXT: [[TMP7:%.*]] = select fast <4 x i1> [[TMP5]], <4 x float> [[TMP6]], <4 x float> [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <4 x float> [[TMP7]], ptr [[TMP9]], align 4
+; CHECK-NEXT: store <4 x float> [[TMP7]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-hint.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-hint.ll
index de8a3c5..c648bed 100644
--- a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-hint.ll
+++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-hint.ll
@@ -1,13 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; REQUIRES: asserts
-; RUN: opt -p loop-vectorize -enable-early-exit-vectorization -force-vector-width=4 \
-; RUN: -debug-only=loop-vectorize -S %s 2>%t | FileCheck --check-prefix=VF4IC4 %s
-; RUN: cat %t | FileCheck --check-prefix=DEBUG %s
+; RUN: opt -p loop-vectorize -enable-early-exit-vectorization -force-vector-width=4 -S %s | FileCheck --check-prefix=VF4IC4 %s
declare void @init_mem(ptr, i64);
-; DEBUG: Interleaving not supported for loops with uncountable early exits
-
define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
; VF4IC4-LABEL: define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
; VF4IC4-NEXT: [[ENTRY:.*]]:
@@ -19,22 +14,33 @@ define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
; VF4IC4: [[VECTOR_BODY]]:
; VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
-; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
-; VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 12
+; VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
+; VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; VF4IC4-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4
+; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
; VF4IC4-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 10)
-; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD1]], splat (i32 10)
+; VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD2]], splat (i32 10)
+; VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD3]], splat (i32 10)
+; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF4IC4-NEXT: [[TMP9:%.*]] = or <4 x i1> [[TMP2]], [[TMP6]]
+; VF4IC4-NEXT: [[TMP10:%.*]] = or <4 x i1> [[TMP9]], [[TMP7]]
+; VF4IC4-NEXT: [[TMP11:%.*]] = or <4 x i1> [[TMP10]], [[TMP8]]
+; VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]])
; VF4IC4-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
; VF4IC4-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]]
; VF4IC4-NEXT: br i1 [[TMP5]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VF4IC4: [[MIDDLE_SPLIT]]:
; VF4IC4-NEXT: br i1 [[TMP3]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
; VF4IC4: [[MIDDLE_BLOCK]]:
-; VF4IC4-NEXT: br i1 true, label %[[E2:.*]], label %[[SCALAR_PH]]
+; VF4IC4-NEXT: br label %[[E2:.*]]
; VF4IC4: [[VECTOR_EARLY_EXIT]]:
; VF4IC4-NEXT: br label %[[E1:.*]]
; VF4IC4: [[SCALAR_PH]]:
-; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; VF4IC4-NEXT: br label %[[LOOP_HEADER:.*]]
; VF4IC4: [[LOOP_HEADER]]:
; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll
index 0f99ed5..3f51c72 100644
--- a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll
@@ -14,22 +14,33 @@ define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
; VF4IC4: vector.body:
; VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
-; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
-; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
-; VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD3]], splat (i32 10)
-; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]])
+; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 12
+; VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
+; VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; VF4IC4-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4
+; VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 10)
+; VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD1]], splat (i32 10)
+; VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD2]], splat (i32 10)
+; VF4IC4-NEXT: [[TMP14:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD3]], splat (i32 10)
+; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF4IC4-NEXT: [[TMP9:%.*]] = or <4 x i1> [[TMP8]], [[TMP6]]
+; VF4IC4-NEXT: [[TMP10:%.*]] = or <4 x i1> [[TMP9]], [[TMP7]]
+; VF4IC4-NEXT: [[TMP11:%.*]] = or <4 x i1> [[TMP10]], [[TMP14]]
+; VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]])
; VF4IC4-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
; VF4IC4-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]]
; VF4IC4-NEXT: br i1 [[TMP5]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VF4IC4: middle.split:
; VF4IC4-NEXT: br i1 [[TMP3]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; VF4IC4: middle.block:
-; VF4IC4-NEXT: br i1 true, label [[E2:%.*]], label [[SCALAR_PH]]
+; VF4IC4-NEXT: br label [[E2:%.*]]
; VF4IC4: vector.early.exit:
; VF4IC4-NEXT: br label [[E1:%.*]]
; VF4IC4: scalar.ph:
-; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; VF4IC4-NEXT: br label [[LOOP_HEADER:%.*]]
; VF4IC4: loop.header:
; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], [[LOOP_LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -88,28 +99,57 @@ define i64 @same_exit_block_pre_inc_use1() {
; VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
; VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
-; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
-; VF4IC4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; VF4IC4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
-; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
+; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
+; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 4
+; VF4IC4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8
+; VF4IC4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 12
+; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
; VF4IC4-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
-; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]])
+; VF4IC4-NEXT: [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; VF4IC4-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; VF4IC4-NEXT: [[TMP35:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD8]]
+; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF4IC4-NEXT: [[TMP14:%.*]] = or <4 x i1> [[TMP13]], [[TMP11]]
+; VF4IC4-NEXT: [[TMP15:%.*]] = or <4 x i1> [[TMP14]], [[TMP12]]
+; VF4IC4-NEXT: [[TMP16:%.*]] = or <4 x i1> [[TMP15]], [[TMP35]]
+; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
; VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; VF4IC4-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
; VF4IC4-NEXT: br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; VF4IC4: middle.split:
; VF4IC4-NEXT: br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; VF4IC4: middle.block:
-; VF4IC4-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; VF4IC4-NEXT: br label [[LOOP_END:%.*]]
; VF4IC4: vector.early.exit:
-; VF4IC4-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
+; VF4IC4-NEXT: [[TMP20:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP35]], i1 true)
+; VF4IC4-NEXT: [[TMP21:%.*]] = add i64 12, [[TMP20]]
+; VF4IC4-NEXT: [[TMP22:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true)
+; VF4IC4-NEXT: [[TMP23:%.*]] = add i64 8, [[TMP22]]
+; VF4IC4-NEXT: [[TMP24:%.*]] = icmp ne i64 [[TMP22]], 4
+; VF4IC4-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 [[TMP21]]
+; VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true)
+; VF4IC4-NEXT: [[TMP27:%.*]] = add i64 4, [[TMP26]]
+; VF4IC4-NEXT: [[TMP28:%.*]] = icmp ne i64 [[TMP26]], 4
+; VF4IC4-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i64 [[TMP27]], i64 [[TMP25]]
+; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
+; VF4IC4-NEXT: [[TMP31:%.*]] = add i64 0, [[TMP30]]
+; VF4IC4-NEXT: [[TMP32:%.*]] = icmp ne i64 [[TMP30]], 4
+; VF4IC4-NEXT: [[TMP8:%.*]] = select i1 [[TMP32]], i64 [[TMP31]], i64 [[TMP29]]
; VF4IC4-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
; VF4IC4-NEXT: [[TMP10:%.*]] = add i64 3, [[TMP9]]
; VF4IC4-NEXT: br label [[LOOP_END]]
; VF4IC4: scalar.ph:
-; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; VF4IC4-NEXT: br label [[LOOP:%.*]]
; VF4IC4: loop:
; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -161,30 +201,53 @@ define ptr @same_exit_block_pre_inc_use1_ivptr() {
; VF4IC4-NEXT: [[PTREND:%.*]] = getelementptr i8, ptr [[P1]], i64 1024
; VF4IC4-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VF4IC4: vector.ph:
-; VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[P1]], i64 1024
; VF4IC4-NEXT: br label [[VECTOR_BODY:%.*]]
; VF4IC4: vector.body:
; VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VF4IC4-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[P1]], i64 [[INDEX]]
-; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
-; VF4IC4-NEXT: [[TMP17:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], splat (i8 72)
-; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]])
-; VF4IC4-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; VF4IC4-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]]
-; VF4IC4-NEXT: br i1 [[TMP5]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 4
+; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8
+; VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 12
+; VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[NEXT_GEP]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; VF4IC4-NEXT: [[TMP17:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], splat (i8 72)
+; VF4IC4-NEXT: [[TMP14:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD1]], splat (i8 72)
+; VF4IC4-NEXT: [[TMP28:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], splat (i8 72)
+; VF4IC4-NEXT: [[TMP29:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], splat (i8 72)
+; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF4IC4-NEXT: [[TMP13:%.*]] = or <4 x i1> [[TMP17]], [[TMP14]]
+; VF4IC4-NEXT: [[TMP31:%.*]] = or <4 x i1> [[TMP13]], [[TMP28]]
+; VF4IC4-NEXT: [[TMP9:%.*]] = or <4 x i1> [[TMP31]], [[TMP29]]
+; VF4IC4-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP9]])
+; VF4IC4-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; VF4IC4-NEXT: [[TMP12:%.*]] = or i1 [[TMP10]], [[TMP11]]
+; VF4IC4-NEXT: br i1 [[TMP12]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VF4IC4: middle.split:
-; VF4IC4-NEXT: br i1 [[TMP3]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; VF4IC4-NEXT: br i1 [[TMP10]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; VF4IC4: middle.block:
-; VF4IC4-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; VF4IC4-NEXT: br label [[LOOP_END:%.*]]
; VF4IC4: vector.early.exit:
-; VF4IC4-NEXT: [[TMP6:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 true)
+; VF4IC4-NEXT: [[TMP15:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP29]], i1 true)
+; VF4IC4-NEXT: [[TMP16:%.*]] = add i64 12, [[TMP15]]
+; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP28]], i1 true)
+; VF4IC4-NEXT: [[TMP18:%.*]] = add i64 8, [[TMP30]]
+; VF4IC4-NEXT: [[TMP19:%.*]] = icmp ne i64 [[TMP30]], 4
+; VF4IC4-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 [[TMP18]], i64 [[TMP16]]
+; VF4IC4-NEXT: [[TMP21:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP14]], i1 true)
+; VF4IC4-NEXT: [[TMP22:%.*]] = add i64 4, [[TMP21]]
+; VF4IC4-NEXT: [[TMP23:%.*]] = icmp ne i64 [[TMP21]], 4
+; VF4IC4-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i64 [[TMP22]], i64 [[TMP20]]
+; VF4IC4-NEXT: [[TMP25:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 true)
+; VF4IC4-NEXT: [[TMP26:%.*]] = add i64 0, [[TMP25]]
+; VF4IC4-NEXT: [[TMP27:%.*]] = icmp ne i64 [[TMP25]], 4
+; VF4IC4-NEXT: [[TMP6:%.*]] = select i1 [[TMP27]], i64 [[TMP26]], i64 [[TMP24]]
; VF4IC4-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], [[TMP6]]
; VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[P1]], i64 [[TMP7]]
; VF4IC4-NEXT: br label [[LOOP_END]]
; VF4IC4: scalar.ph:
-; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[P1]], [[ENTRY:%.*]] ]
+; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[P1]], [[ENTRY:%.*]] ]
; VF4IC4-NEXT: br label [[LOOP:%.*]]
; VF4IC4: loop:
; VF4IC4-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -235,28 +298,57 @@ define i64 @same_exit_block_post_inc_use() {
; VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
; VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
-; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
-; VF4IC4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; VF4IC4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
-; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
+; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
+; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 4
+; VF4IC4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8
+; VF4IC4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 12
+; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
; VF4IC4-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
-; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]])
+; VF4IC4-NEXT: [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; VF4IC4-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; VF4IC4-NEXT: [[TMP35:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD8]]
+; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF4IC4-NEXT: [[TMP14:%.*]] = or <4 x i1> [[TMP13]], [[TMP11]]
+; VF4IC4-NEXT: [[TMP15:%.*]] = or <4 x i1> [[TMP14]], [[TMP12]]
+; VF4IC4-NEXT: [[TMP16:%.*]] = or <4 x i1> [[TMP15]], [[TMP35]]
+; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
; VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; VF4IC4-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
; VF4IC4-NEXT: br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; VF4IC4: middle.split:
; VF4IC4-NEXT: br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; VF4IC4: middle.block:
-; VF4IC4-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; VF4IC4-NEXT: br label [[LOOP_END:%.*]]
; VF4IC4: vector.early.exit:
-; VF4IC4-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
+; VF4IC4-NEXT: [[TMP20:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP35]], i1 true)
+; VF4IC4-NEXT: [[TMP21:%.*]] = add i64 12, [[TMP20]]
+; VF4IC4-NEXT: [[TMP22:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true)
+; VF4IC4-NEXT: [[TMP23:%.*]] = add i64 8, [[TMP22]]
+; VF4IC4-NEXT: [[TMP24:%.*]] = icmp ne i64 [[TMP22]], 4
+; VF4IC4-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 [[TMP21]]
+; VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true)
+; VF4IC4-NEXT: [[TMP27:%.*]] = add i64 4, [[TMP26]]
+; VF4IC4-NEXT: [[TMP28:%.*]] = icmp ne i64 [[TMP26]], 4
+; VF4IC4-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i64 [[TMP27]], i64 [[TMP25]]
+; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
+; VF4IC4-NEXT: [[TMP31:%.*]] = add i64 0, [[TMP30]]
+; VF4IC4-NEXT: [[TMP32:%.*]] = icmp ne i64 [[TMP30]], 4
+; VF4IC4-NEXT: [[TMP8:%.*]] = select i1 [[TMP32]], i64 [[TMP31]], i64 [[TMP29]]
; VF4IC4-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
; VF4IC4-NEXT: [[TMP10:%.*]] = add i64 3, [[TMP9]]
; VF4IC4-NEXT: br label [[LOOP_END]]
; VF4IC4: scalar.ph:
-; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; VF4IC4-NEXT: br label [[LOOP:%.*]]
; VF4IC4: loop:
; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -314,28 +406,57 @@ define i64 @diff_exit_block_pre_inc_use1() {
; VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
; VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
-; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
-; VF4IC4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; VF4IC4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
-; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
+; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
+; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 4
+; VF4IC4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8
+; VF4IC4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 12
+; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
; VF4IC4-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
-; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]])
+; VF4IC4-NEXT: [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; VF4IC4-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; VF4IC4-NEXT: [[TMP35:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD8]]
+; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF4IC4-NEXT: [[TMP14:%.*]] = or <4 x i1> [[TMP13]], [[TMP11]]
+; VF4IC4-NEXT: [[TMP15:%.*]] = or <4 x i1> [[TMP14]], [[TMP12]]
+; VF4IC4-NEXT: [[TMP16:%.*]] = or <4 x i1> [[TMP15]], [[TMP35]]
+; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
; VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; VF4IC4-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
; VF4IC4-NEXT: br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; VF4IC4: middle.split:
; VF4IC4-NEXT: br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; VF4IC4: middle.block:
-; VF4IC4-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; VF4IC4-NEXT: br label [[LOOP_END:%.*]]
; VF4IC4: vector.early.exit:
-; VF4IC4-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
+; VF4IC4-NEXT: [[TMP20:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP35]], i1 true)
+; VF4IC4-NEXT: [[TMP21:%.*]] = add i64 12, [[TMP20]]
+; VF4IC4-NEXT: [[TMP22:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true)
+; VF4IC4-NEXT: [[TMP23:%.*]] = add i64 8, [[TMP22]]
+; VF4IC4-NEXT: [[TMP24:%.*]] = icmp ne i64 [[TMP22]], 4
+; VF4IC4-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 [[TMP21]]
+; VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true)
+; VF4IC4-NEXT: [[TMP27:%.*]] = add i64 4, [[TMP26]]
+; VF4IC4-NEXT: [[TMP28:%.*]] = icmp ne i64 [[TMP26]], 4
+; VF4IC4-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i64 [[TMP27]], i64 [[TMP25]]
+; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
+; VF4IC4-NEXT: [[TMP31:%.*]] = add i64 0, [[TMP30]]
+; VF4IC4-NEXT: [[TMP32:%.*]] = icmp ne i64 [[TMP30]], 4
+; VF4IC4-NEXT: [[TMP8:%.*]] = select i1 [[TMP32]], i64 [[TMP31]], i64 [[TMP29]]
; VF4IC4-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
; VF4IC4-NEXT: [[TMP10:%.*]] = add i64 3, [[TMP9]]
; VF4IC4-NEXT: br label [[LOOP_EARLY_EXIT:%.*]]
; VF4IC4: scalar.ph:
-; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; VF4IC4-NEXT: br label [[LOOP:%.*]]
; VF4IC4: loop:
; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -400,28 +521,57 @@ define i64 @diff_exit_block_post_inc_use1() {
; VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
; VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
-; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
-; VF4IC4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; VF4IC4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
-; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
+; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
+; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; VF4IC4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 4
+; VF4IC4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8
+; VF4IC4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 12
+; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
; VF4IC4-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
-; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]])
+; VF4IC4-NEXT: [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; VF4IC4-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; VF4IC4-NEXT: [[TMP35:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD8]]
+; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF4IC4-NEXT: [[TMP14:%.*]] = or <4 x i1> [[TMP13]], [[TMP11]]
+; VF4IC4-NEXT: [[TMP15:%.*]] = or <4 x i1> [[TMP14]], [[TMP12]]
+; VF4IC4-NEXT: [[TMP16:%.*]] = or <4 x i1> [[TMP15]], [[TMP35]]
+; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
; VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; VF4IC4-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
; VF4IC4-NEXT: br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; VF4IC4: middle.split:
; VF4IC4-NEXT: br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; VF4IC4: middle.block:
-; VF4IC4-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; VF4IC4-NEXT: br label [[LOOP_END:%.*]]
; VF4IC4: vector.early.exit:
-; VF4IC4-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
+; VF4IC4-NEXT: [[TMP20:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP35]], i1 true)
+; VF4IC4-NEXT: [[TMP21:%.*]] = add i64 12, [[TMP20]]
+; VF4IC4-NEXT: [[TMP22:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true)
+; VF4IC4-NEXT: [[TMP23:%.*]] = add i64 8, [[TMP22]]
+; VF4IC4-NEXT: [[TMP24:%.*]] = icmp ne i64 [[TMP22]], 4
+; VF4IC4-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 [[TMP21]]
+; VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true)
+; VF4IC4-NEXT: [[TMP27:%.*]] = add i64 4, [[TMP26]]
+; VF4IC4-NEXT: [[TMP28:%.*]] = icmp ne i64 [[TMP26]], 4
+; VF4IC4-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i64 [[TMP27]], i64 [[TMP25]]
+; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
+; VF4IC4-NEXT: [[TMP31:%.*]] = add i64 0, [[TMP30]]
+; VF4IC4-NEXT: [[TMP32:%.*]] = icmp ne i64 [[TMP30]], 4
+; VF4IC4-NEXT: [[TMP8:%.*]] = select i1 [[TMP32]], i64 [[TMP31]], i64 [[TMP29]]
; VF4IC4-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
; VF4IC4-NEXT: [[TMP10:%.*]] = add i64 3, [[TMP9]]
; VF4IC4-NEXT: br label [[LOOP_EARLY_EXIT:%.*]]
; VF4IC4: scalar.ph:
-; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; VF4IC4-NEXT: br label [[LOOP:%.*]]
; VF4IC4: loop:
; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -488,30 +638,73 @@ define i64 @same_exit_block_pre_inc_use1_reverse() {
; VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
; VF4IC4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 -3
+; VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -4
+; VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -3
+; VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -8
+; VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 -3
+; VF4IC4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -12
+; VF4IC4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 -3
; VF4IC4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
; VF4IC4-NEXT: [[REVERSE14:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD13]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; VF4IC4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP6]], align 1
+; VF4IC4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD3]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP27]], align 1
+; VF4IC4-NEXT: [[REVERSE6:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD5]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF4IC4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
; VF4IC4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 0
; VF4IC4-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 -3
+; VF4IC4-NEXT: [[TMP41:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 -4
+; VF4IC4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP41]], i32 -3
+; VF4IC4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 -8
+; VF4IC4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 -3
+; VF4IC4-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 -12
+; VF4IC4-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 -3
; VF4IC4-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i8>, ptr [[TMP46]], align 1
; VF4IC4-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD14]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
+; VF4IC4-NEXT: [[REVERSE10:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD9]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; VF4IC4-NEXT: [[REVERSE12:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD11]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x i8>, ptr [[TMP42]], align 1
+; VF4IC4-NEXT: [[REVERSE16:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD15]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF4IC4-NEXT: [[TMP21:%.*]] = icmp ne <4 x i8> [[REVERSE14]], [[REVERSE15]]
-; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VF4IC4-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP21]])
-; VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020
+; VF4IC4-NEXT: [[TMP19:%.*]] = icmp ne <4 x i8> [[REVERSE2]], [[REVERSE10]]
+; VF4IC4-NEXT: [[TMP20:%.*]] = icmp ne <4 x i8> [[REVERSE4]], [[REVERSE12]]
+; VF4IC4-NEXT: [[TMP43:%.*]] = icmp ne <4 x i8> [[REVERSE6]], [[REVERSE16]]
+; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF4IC4-NEXT: [[TMP22:%.*]] = or <4 x i1> [[TMP21]], [[TMP19]]
+; VF4IC4-NEXT: [[TMP23:%.*]] = or <4 x i1> [[TMP22]], [[TMP20]]
+; VF4IC4-NEXT: [[TMP24:%.*]] = or <4 x i1> [[TMP23]], [[TMP43]]
+; VF4IC4-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP24]])
+; VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1008
; VF4IC4-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
; VF4IC4-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; VF4IC4: middle.split:
; VF4IC4-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; VF4IC4: middle.block:
-; VF4IC4-NEXT: br i1 false, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; VF4IC4-NEXT: br label [[SCALAR_PH]]
; VF4IC4: vector.early.exit:
-; VF4IC4-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP21]], i1 true)
+; VF4IC4-NEXT: [[TMP28:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP43]], i1 true)
+; VF4IC4-NEXT: [[TMP29:%.*]] = add i64 12, [[TMP28]]
+; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP20]], i1 true)
+; VF4IC4-NEXT: [[TMP31:%.*]] = add i64 8, [[TMP30]]
+; VF4IC4-NEXT: [[TMP32:%.*]] = icmp ne i64 [[TMP30]], 4
+; VF4IC4-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i64 [[TMP31]], i64 [[TMP29]]
+; VF4IC4-NEXT: [[TMP34:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP19]], i1 true)
+; VF4IC4-NEXT: [[TMP35:%.*]] = add i64 4, [[TMP34]]
+; VF4IC4-NEXT: [[TMP36:%.*]] = icmp ne i64 [[TMP34]], 4
+; VF4IC4-NEXT: [[TMP37:%.*]] = select i1 [[TMP36]], i64 [[TMP35]], i64 [[TMP33]]
+; VF4IC4-NEXT: [[TMP38:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP21]], i1 true)
+; VF4IC4-NEXT: [[TMP39:%.*]] = add i64 0, [[TMP38]]
+; VF4IC4-NEXT: [[TMP40:%.*]] = icmp ne i64 [[TMP38]], 4
+; VF4IC4-NEXT: [[TMP10:%.*]] = select i1 [[TMP40]], i64 [[TMP39]], i64 [[TMP37]]
; VF4IC4-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
; VF4IC4-NEXT: [[TMP12:%.*]] = sub i64 1023, [[TMP11]]
-; VF4IC4-NEXT: br label [[LOOP_END]]
+; VF4IC4-NEXT: br label [[LOOP_END:%.*]]
; VF4IC4: scalar.ph:
-; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ]
+; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 15, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ]
; VF4IC4-NEXT: br label [[LOOP:%.*]]
; VF4IC4: loop:
; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -526,7 +719,7 @@ define i64 @same_exit_block_pre_inc_use1_reverse() {
; VF4IC4-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 0
; VF4IC4-NEXT: br i1 [[EXITCOND]], label [[LOOP_END]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
; VF4IC4: loop.end:
-; VF4IC4-NEXT: [[RETVAL:%.*]] = phi i64 [ [[IV]], [[LOOP]] ], [ 1024, [[LOOP_INC]] ], [ 1024, [[MIDDLE_BLOCK]] ], [ [[TMP12]], [[VECTOR_EARLY_EXIT]] ]
+; VF4IC4-NEXT: [[RETVAL:%.*]] = phi i64 [ [[IV]], [[LOOP]] ], [ 1024, [[LOOP_INC]] ], [ [[TMP12]], [[VECTOR_EARLY_EXIT]] ]
; VF4IC4-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -569,27 +762,68 @@ define i8 @same_exit_block_use_loaded_value() {
; VF4IC4: vector.body:
; VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
-; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
-; VF4IC4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; VF4IC4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0
-; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1
-; VF4IC4-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
-; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]])
+; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+; VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
+; VF4IC4-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; VF4IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 4
+; VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 8
+; VF4IC4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 12
+; VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
+; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; VF4IC4-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; VF4IC4-NEXT: [[TMP29:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD8]]
+; VF4IC4-NEXT: [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD5]], [[WIDE_LOAD9]]
+; VF4IC4-NEXT: [[TMP17:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF4IC4-NEXT: [[TMP13:%.*]] = or <4 x i1> [[TMP12]], [[TMP29]]
+; VF4IC4-NEXT: [[TMP14:%.*]] = or <4 x i1> [[TMP13]], [[TMP11]]
+; VF4IC4-NEXT: [[TMP16:%.*]] = or <4 x i1> [[TMP14]], [[TMP17]]
+; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
; VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; VF4IC4-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
; VF4IC4-NEXT: br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; VF4IC4: middle.split:
; VF4IC4-NEXT: br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; VF4IC4: middle.block:
-; VF4IC4-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; VF4IC4-NEXT: br label [[LOOP_END:%.*]]
; VF4IC4: vector.early.exit:
-; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE1:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
-; VF4IC4-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i8> [[WIDE_LOAD3]], i64 [[FIRST_ACTIVE_LANE1]]
+; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 true)
+; VF4IC4-NEXT: [[TMP20:%.*]] = add i64 12, [[FIRST_ACTIVE_LANE]]
+; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true)
+; VF4IC4-NEXT: [[TMP21:%.*]] = add i64 8, [[FIRST_ACTIVE_LANE8]]
+; VF4IC4-NEXT: [[TMP22:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE8]], 4
+; VF4IC4-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i64 [[TMP21]], i64 [[TMP20]]
+; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP29]], i1 true)
+; VF4IC4-NEXT: [[TMP24:%.*]] = add i64 4, [[FIRST_ACTIVE_LANE9]]
+; VF4IC4-NEXT: [[TMP25:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE9]], 4
+; VF4IC4-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i64 [[TMP24]], i64 [[TMP23]]
+; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE1:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true)
+; VF4IC4-NEXT: [[TMP27:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE1]]
+; VF4IC4-NEXT: [[TMP28:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE1]], 4
+; VF4IC4-NEXT: [[TMP8:%.*]] = select i1 [[TMP28]], i64 [[TMP27]], i64 [[TMP26]]
+; VF4IC4-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i8> [[WIDE_LOAD2]], i64 [[TMP8]]
+; VF4IC4-NEXT: [[TMP31:%.*]] = sub i64 [[TMP8]], 4
+; VF4IC4-NEXT: [[TMP32:%.*]] = extractelement <4 x i8> [[WIDE_LOAD4]], i64 [[TMP31]]
+; VF4IC4-NEXT: [[TMP33:%.*]] = icmp uge i64 [[TMP8]], 4
+; VF4IC4-NEXT: [[TMP34:%.*]] = select i1 [[TMP33]], i8 [[TMP32]], i8 [[EARLY_EXIT_VALUE]]
+; VF4IC4-NEXT: [[TMP35:%.*]] = sub i64 [[TMP8]], 8
+; VF4IC4-NEXT: [[TMP36:%.*]] = extractelement <4 x i8> [[WIDE_LOAD5]], i64 [[TMP35]]
+; VF4IC4-NEXT: [[TMP37:%.*]] = icmp uge i64 [[TMP8]], 8
+; VF4IC4-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i8 [[TMP36]], i8 [[TMP34]]
+; VF4IC4-NEXT: [[TMP39:%.*]] = sub i64 [[TMP8]], 12
+; VF4IC4-NEXT: [[TMP40:%.*]] = extractelement <4 x i8> [[WIDE_LOAD3]], i64 [[TMP39]]
+; VF4IC4-NEXT: [[TMP41:%.*]] = icmp uge i64 [[TMP8]], 12
+; VF4IC4-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i8 [[TMP40]], i8 [[TMP38]]
; VF4IC4-NEXT: br label [[LOOP_END]]
; VF4IC4: scalar.ph:
-; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; VF4IC4-NEXT: br label [[LOOP:%.*]]
; VF4IC4: loop:
; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -604,7 +838,7 @@ define i8 @same_exit_block_use_loaded_value() {
; VF4IC4-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VF4IC4-NEXT: br i1 [[EXITCOND]], label [[LOOP_END]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
; VF4IC4: loop.end:
-; VF4IC4-NEXT: [[RETVAL:%.*]] = phi i8 [ [[LD1]], [[LOOP]] ], [ -1, [[LOOP_INC]] ], [ -1, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
+; VF4IC4-NEXT: [[RETVAL:%.*]] = phi i8 [ [[LD1]], [[LOOP]] ], [ -1, [[LOOP_INC]] ], [ -1, [[MIDDLE_BLOCK]] ], [ [[TMP42]], [[VECTOR_EARLY_EXIT]] ]
; VF4IC4-NEXT: ret i8 [[RETVAL]]
;
entry:
@@ -650,29 +884,84 @@ define i8 @same_exit_block_reverse_use_loaded_value() {
; VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
; VF4IC4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
; VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 -3
+; VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -4
+; VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -3
+; VF4IC4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -8
+; VF4IC4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 -3
+; VF4IC4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 -12
+; VF4IC4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 -3
; VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, ptr [[TMP8]], align 1
; VF4IC4-NEXT: [[REVERSE6:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD5]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; VF4IC4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP6]], align 1
+; VF4IC4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD3]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
+; VF4IC4-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD6]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF4IC4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
; VF4IC4-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
; VF4IC4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP38]], i32 -3
+; VF4IC4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 -4
+; VF4IC4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 -3
+; VF4IC4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 -8
+; VF4IC4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 -3
+; VF4IC4-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 -12
+; VF4IC4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 -3
; VF4IC4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
; VF4IC4-NEXT: [[REVERSE14:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD13]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
+; VF4IC4-NEXT: [[REVERSE10:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD9]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; VF4IC4-NEXT: [[REVERSE12:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD11]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF4IC4-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
+; VF4IC4-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD14]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; VF4IC4-NEXT: [[TMP21:%.*]] = icmp ne <4 x i8> [[REVERSE6]], [[REVERSE14]]
-; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VF4IC4-NEXT: [[TMP25:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP21]])
-; VF4IC4-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020
+; VF4IC4-NEXT: [[TMP19:%.*]] = icmp ne <4 x i8> [[REVERSE2]], [[REVERSE10]]
+; VF4IC4-NEXT: [[TMP20:%.*]] = icmp ne <4 x i8> [[REVERSE4]], [[REVERSE12]]
+; VF4IC4-NEXT: [[TMP37:%.*]] = icmp ne <4 x i8> [[REVERSE7]], [[REVERSE15]]
+; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF4IC4-NEXT: [[TMP22:%.*]] = or <4 x i1> [[TMP21]], [[TMP19]]
+; VF4IC4-NEXT: [[TMP23:%.*]] = or <4 x i1> [[TMP22]], [[TMP20]]
+; VF4IC4-NEXT: [[TMP24:%.*]] = or <4 x i1> [[TMP23]], [[TMP37]]
+; VF4IC4-NEXT: [[TMP25:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP24]])
+; VF4IC4-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1008
; VF4IC4-NEXT: [[TMP27:%.*]] = or i1 [[TMP25]], [[TMP26]]
; VF4IC4-NEXT: br i1 [[TMP27]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; VF4IC4: middle.split:
; VF4IC4-NEXT: br i1 [[TMP25]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; VF4IC4: middle.block:
-; VF4IC4-NEXT: br i1 false, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; VF4IC4-NEXT: br label [[SCALAR_PH]]
; VF4IC4: vector.early.exit:
+; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP37]], i1 true)
+; VF4IC4-NEXT: [[TMP28:%.*]] = add i64 12, [[FIRST_ACTIVE_LANE]]
+; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE15:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP20]], i1 true)
+; VF4IC4-NEXT: [[TMP29:%.*]] = add i64 8, [[FIRST_ACTIVE_LANE15]]
+; VF4IC4-NEXT: [[TMP30:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE15]], 4
+; VF4IC4-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i64 [[TMP29]], i64 [[TMP28]]
+; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE16:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP19]], i1 true)
+; VF4IC4-NEXT: [[TMP32:%.*]] = add i64 4, [[FIRST_ACTIVE_LANE16]]
+; VF4IC4-NEXT: [[TMP33:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE16]], 4
+; VF4IC4-NEXT: [[TMP34:%.*]] = select i1 [[TMP33]], i64 [[TMP32]], i64 [[TMP31]]
; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE1:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP21]], i1 true)
-; VF4IC4-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i8> [[REVERSE6]], i64 [[FIRST_ACTIVE_LANE1]]
-; VF4IC4-NEXT: br label [[LOOP_END]]
+; VF4IC4-NEXT: [[TMP35:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE1]]
+; VF4IC4-NEXT: [[TMP36:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE1]], 4
+; VF4IC4-NEXT: [[TMP10:%.*]] = select i1 [[TMP36]], i64 [[TMP35]], i64 [[TMP34]]
+; VF4IC4-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i8> [[REVERSE6]], i64 [[TMP10]]
+; VF4IC4-NEXT: [[TMP39:%.*]] = sub i64 [[TMP10]], 4
+; VF4IC4-NEXT: [[TMP40:%.*]] = extractelement <4 x i8> [[REVERSE2]], i64 [[TMP39]]
+; VF4IC4-NEXT: [[TMP41:%.*]] = icmp uge i64 [[TMP10]], 4
+; VF4IC4-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i8 [[TMP40]], i8 [[EARLY_EXIT_VALUE]]
+; VF4IC4-NEXT: [[TMP43:%.*]] = sub i64 [[TMP10]], 8
+; VF4IC4-NEXT: [[TMP44:%.*]] = extractelement <4 x i8> [[REVERSE4]], i64 [[TMP43]]
+; VF4IC4-NEXT: [[TMP45:%.*]] = icmp uge i64 [[TMP10]], 8
+; VF4IC4-NEXT: [[TMP46:%.*]] = select i1 [[TMP45]], i8 [[TMP44]], i8 [[TMP42]]
+; VF4IC4-NEXT: [[TMP47:%.*]] = sub i64 [[TMP10]], 12
+; VF4IC4-NEXT: [[TMP48:%.*]] = extractelement <4 x i8> [[REVERSE7]], i64 [[TMP47]]
+; VF4IC4-NEXT: [[TMP49:%.*]] = icmp uge i64 [[TMP10]], 12
+; VF4IC4-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i8 [[TMP48]], i8 [[TMP46]]
+; VF4IC4-NEXT: br label [[LOOP_END:%.*]]
; VF4IC4: scalar.ph:
-; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ]
+; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 15, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ]
; VF4IC4-NEXT: br label [[LOOP:%.*]]
; VF4IC4: loop:
; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -687,7 +976,7 @@ define i8 @same_exit_block_reverse_use_loaded_value() {
; VF4IC4-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 0
; VF4IC4-NEXT: br i1 [[EXITCOND]], label [[LOOP_END]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
; VF4IC4: loop.end:
-; VF4IC4-NEXT: [[RETVAL:%.*]] = phi i8 [ [[LD1]], [[LOOP]] ], [ -1, [[LOOP_INC]] ], [ -1, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
+; VF4IC4-NEXT: [[RETVAL:%.*]] = phi i8 [ [[LD1]], [[LOOP]] ], [ -1, [[LOOP_INC]] ], [ [[TMP50]], [[VECTOR_EARLY_EXIT]] ]
; VF4IC4-NEXT: ret i8 [[RETVAL]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
index d5a206f..842ff91 100644
--- a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
+++ b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
@@ -19,21 +19,19 @@ define void @single_incoming_phi_no_blend_mask(i64 %a, i64 %b) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDEX]] to i16
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP5]], <2 x i16> splat (i16 1), <2 x i16> [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <2 x i16> [[PREDPHI]], ptr [[TMP7]], align 2
+; CHECK-NEXT: store <2 x i16> [[PREDPHI]], ptr [[TMP6]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
; CHECK: loop.header:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
@@ -103,24 +101,22 @@ define void @single_incoming_phi_with_blend_mask(i64 %a, i64 %b) {
; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDEX]] to i16
; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr [32 x i16], ptr @src, i16 0, i16 [[TMP1]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP4]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = icmp sle <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP7:%.*]] = select <2 x i1> [[TMP3]], <2 x i1> [[TMP6]], <2 x i1> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true)
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i16> [[WIDE_LOAD]], <2 x i16> splat (i16 1)
; CHECK-NEXT: [[PREDPHI1:%.*]] = select <2 x i1> [[TMP8]], <2 x i16> zeroinitializer, <2 x i16> [[PREDPHI]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP9]], i32 0
-; CHECK-NEXT: store <2 x i16> [[PREDPHI1]], ptr [[TMP10]], align 2
+; CHECK-NEXT: store <2 x i16> [[PREDPHI1]], ptr [[TMP9]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
; CHECK: loop.header:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
@@ -200,8 +196,7 @@ define void @multiple_incoming_phi_with_blend_mask(i64 %a, ptr noalias %dst) {
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i16> poison, i16 [[TMP6]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i16> [[TMP8]], i16 [[TMP7]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0
-; CHECK-NEXT: store <2 x i16> [[TMP9]], ptr [[TMP11]], align 2
+; CHECK-NEXT: store <2 x i16> [[TMP9]], ptr [[TMP10]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <2 x i16> [[VEC_IND1]], splat (i16 2)
@@ -209,9 +204,9 @@ define void @multiple_incoming_phi_with_blend_mask(i64 %a, ptr noalias %dst) {
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
; CHECK: loop.header:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
@@ -301,16 +296,15 @@ define void @single_incoming_needs_predication(i64 %a, i64 %b) {
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP16]], <2 x i16> [[TMP14]], <2 x i16> splat (i16 1)
; CHECK-NEXT: [[PREDPHI3:%.*]] = select <2 x i1> [[TMP17]], <2 x i16> zeroinitializer, <2 x i16> [[PREDPHI]]
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[TMP18]], i32 0
-; CHECK-NEXT: store <2 x i16> [[PREDPHI3]], ptr [[TMP19]], align 2
+; CHECK-NEXT: store <2 x i16> [[PREDPHI3]], ptr [[TMP18]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
; CHECK: loop.header:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
@@ -377,16 +371,15 @@ define void @duplicated_incoming_blocks_blend(i32 %x, ptr %ptr) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP2]], align 4
+; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
; CHECK: loop.header:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD_I:%.*]], [[LOOP_LATCH:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
index 2069570..2c0a6f1 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
@@ -18,11 +18,9 @@ define i64 @same_exit_block_phi_of_consts() {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
@@ -32,11 +30,11 @@ define i64 @same_exit_block_phi_of_consts() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -95,11 +93,9 @@ define i64 @diff_exit_block_phi_of_consts() {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
@@ -109,11 +105,11 @@ define i64 @diff_exit_block_phi_of_consts() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -193,11 +189,9 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP11]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]])
@@ -294,11 +288,11 @@ define i32 @diff_blocks_invariant_early_exit_cond(ptr %s) {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP1]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: br label [[EARLY_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 266, [[MIDDLE_BLOCK]] ], [ -10, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ -10, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IND:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
index a21666a..940e398 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
@@ -17,11 +17,9 @@ define i64 @same_exit_block_pre_inc_use1() {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
@@ -31,14 +29,14 @@ define i64 @same_exit_block_pre_inc_use1() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -97,11 +95,9 @@ define i32 @same_exit_block_pre_inc_use1_iv64_endi32_step2() {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
@@ -111,7 +107,7 @@ define i32 @same_exit_block_pre_inc_use1_iv64_endi32_step2() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
@@ -120,8 +116,8 @@ define i32 @same_exit_block_pre_inc_use1_iv64_endi32_step2() {
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i32 9, [[TMP11]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ 137, [[MIDDLE_BLOCK]] ], [ 9, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ 9, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -177,14 +173,12 @@ define i32 @same_exit_block_pre_inc_use1_iv128_endi32_step2() {
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[P1]], i64 64
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i128 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i128 [[INDEX1]] to i64
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[NEXT_GEP]], align 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], splat (i8 3)
; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i128 [[INDEX1]], 4
; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
@@ -194,7 +188,7 @@ define i32 @same_exit_block_pre_inc_use1_iv128_endi32_step2() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
; CHECK-NEXT: [[TMP8:%.*]] = zext i64 [[FIRST_ACTIVE_LANE]] to i128
@@ -204,9 +198,9 @@ define i32 @same_exit_block_pre_inc_use1_iv128_endi32_step2() {
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i32 9, [[TMP10]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i128 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ 137, [[MIDDLE_BLOCK]] ], [ 9, [[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[P1]], [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i128 [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ 9, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[P1]], [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i128 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -265,11 +259,9 @@ define float @same_exit_block_pre_inc_use1_iv64_endf32() {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
@@ -279,7 +271,7 @@ define float @same_exit_block_pre_inc_use1_iv64_endf32() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
@@ -288,8 +280,8 @@ define float @same_exit_block_pre_inc_use1_iv64_endf32() {
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = fadd fast float 9.000000e+00, [[TMP11]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi float [ 7.300000e+01, [[MIDDLE_BLOCK]] ], [ 9.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi float [ 9.000000e+00, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -347,17 +339,14 @@ define ptr @same_exit_block_pre_inc_use1_iv64_endptr() {
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[P2]], i64 320
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP12]], align 1
; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD6]]
; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP15]])
@@ -367,7 +356,7 @@ define ptr @same_exit_block_pre_inc_use1_iv64_endptr() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP16]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP15]], i1 true)
; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
@@ -375,8 +364,8 @@ define ptr @same_exit_block_pre_inc_use1_iv64_endptr() {
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = getelementptr i8, ptr [[P2]], i64 [[TMP20]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi ptr [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[P2]], [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi ptr [ [[P2]], [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -433,13 +422,11 @@ define ptr @same_exit_block_pre_inc_use1_ivptr() {
; CHECK-NEXT: [[PTREND:%.*]] = getelementptr i8, ptr [[P1]], i64 1024
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[P1]], i64 1024
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[NEXT_GEP]], align 1
; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], splat (i8 72)
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]])
@@ -449,14 +436,14 @@ define ptr @same_exit_block_pre_inc_use1_ivptr() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP12]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true)
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], [[FIRST_ACTIVE_LANE]]
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = getelementptr i8, ptr [[P1]], i64 [[TMP8]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[P1]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[P1]], [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -509,11 +496,9 @@ define i64 @same_exit_block_pre_inc1_use_inv_cond(i1 %cond) {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[COND]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
@@ -525,14 +510,14 @@ define i64 @same_exit_block_pre_inc1_use_inv_cond(i1 %cond) {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP7]], i1 true)
; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -593,11 +578,9 @@ define i64 @same_exit_block_pre_inc_use1_gep_two_indices() {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P1]], i64 0, i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P2]], i64 0, i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
@@ -607,14 +590,14 @@ define i64 @same_exit_block_pre_inc_use1_gep_two_indices() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -673,11 +656,9 @@ define i64 @same_exit_block_pre_inc_use1_alloca_diff_type() {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
@@ -687,14 +668,14 @@ define i64 @same_exit_block_pre_inc_use1_alloca_diff_type() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -753,11 +734,9 @@ define i64 @same_exit_block_pre_inc_use2() {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
@@ -767,11 +746,11 @@ define i64 @same_exit_block_pre_inc_use2() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -830,11 +809,9 @@ define i64 @same_exit_block_pre_inc_use3() {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
@@ -844,14 +821,14 @@ define i64 @same_exit_block_pre_inc_use3() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -912,8 +889,7 @@ define i64 @same_exit_block_pre_inc_use4() {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp uge <4 x i64> [[VEC_IND]], [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
@@ -924,14 +900,14 @@ define i64 @same_exit_block_pre_inc_use4() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP8]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -986,11 +962,9 @@ define i64 @same_exit_block_post_inc_use() {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]])
@@ -1000,14 +974,14 @@ define i64 @same_exit_block_post_inc_use() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP14]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -1060,13 +1034,11 @@ define ptr @same_exit_block_post_inc_use1_ivptr() {
; CHECK-NEXT: [[PTREND:%.*]] = getelementptr i8, ptr [[P1]], i64 1024
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[P1]], i64 1024
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[NEXT_GEP]], align 1
; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], splat (i8 72)
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP15]])
@@ -1076,7 +1048,7 @@ define ptr @same_exit_block_post_inc_use1_ivptr() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP16]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP15]], i1 true)
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], [[FIRST_ACTIVE_LANE]]
@@ -1084,7 +1056,7 @@ define ptr @same_exit_block_post_inc_use1_ivptr() {
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = getelementptr i8, ptr [[P1]], i64 [[TMP9]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[P1]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[P1]], [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -1136,11 +1108,9 @@ define i64 @same_exit_block_post_inc_use2() {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]])
@@ -1150,7 +1120,7 @@ define i64 @same_exit_block_post_inc_use2() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP18]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 true)
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
@@ -1158,7 +1128,7 @@ define i64 @same_exit_block_post_inc_use2() {
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -1217,11 +1187,9 @@ define i64 @diff_exit_block_pre_inc_use1() {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
@@ -1231,14 +1199,14 @@ define i64 @diff_exit_block_pre_inc_use1() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]]
; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -1304,11 +1272,9 @@ define i64 @diff_exit_block_pre_inc_use2() {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
@@ -1318,11 +1284,11 @@ define i64 @diff_exit_block_pre_inc_use2() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -1388,11 +1354,9 @@ define i64 @diff_exit_block_pre_inc_use3() {
; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX2]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD3]]
; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX2]], 4
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
@@ -1402,14 +1366,14 @@ define i64 @diff_exit_block_pre_inc_use3() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX2]], [[FIRST_ACTIVE_LANE]]
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]]
; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -1473,11 +1437,9 @@ define i64 @diff_exit_block_post_inc_use1() {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP6]], align 1
; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]])
@@ -1487,14 +1449,14 @@ define i64 @diff_exit_block_post_inc_use1() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP14]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]]
; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -1560,11 +1522,9 @@ define i64 @diff_exit_block_post_inc_use2() {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP12]], align 1
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP14]], align 1
; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]])
@@ -1574,7 +1534,7 @@ define i64 @diff_exit_block_post_inc_use2() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP18]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 true)
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
@@ -1582,7 +1542,7 @@ define i64 @diff_exit_block_post_inc_use2() {
; CHECK-NEXT: [[TMP21:%.*]] = add i64 3, [[TMP11]]
; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -1649,11 +1609,9 @@ define i64 @diff_exit_block_post_inc_use3(i64 %start) {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP14]], align 1
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP16]], align 1
; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD3]]
; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP19]])
@@ -1664,7 +1622,7 @@ define i64 @diff_exit_block_post_inc_use3(i64 %start) {
; CHECK-NEXT: br i1 [[TMP20]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[IND_ESCAPE:%.*]] = sub i64 [[TMP0]], 1
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP19]], i1 true)
; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
@@ -1672,8 +1630,8 @@ define i64 @diff_exit_block_post_inc_use3(i64 %start) {
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 [[START]], [[TMP12]]
; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i64 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i64 [ [[START]], [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -1743,8 +1701,7 @@ define i64 @loop_contains_safe_call() {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP5:%.*]] = fcmp fast oge <4 x float> [[TMP3]], splat (float 3.000000e+00)
; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4
@@ -1755,14 +1712,14 @@ define i64 @loop_contains_safe_call() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP6]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true)
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP9]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -1819,8 +1776,7 @@ define i64 @loop_contains_safe_div() {
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = udiv <4 x i32> [[WIDE_LOAD]], splat (i32 20000)
; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP3]], splat (i32 1)
; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4
@@ -1831,14 +1787,14 @@ define i64 @loop_contains_safe_div() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP6]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true)
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP9]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -1894,12 +1850,10 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD]], splat (i32 1)
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P2]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
@@ -1909,14 +1863,14 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[WIDE_LOAD2]], i32 3
; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -1991,12 +1945,12 @@ define i64 @same_exit_block_pre_inc_use1_reverse() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP9]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP8]], i1 true)
; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = sub i64 1023, [[TMP12]]
-; CHECK-NEXT: br label [[LOOP_END]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP1:%.*]]
@@ -2013,7 +1967,7 @@ define i64 @same_exit_block_pre_inc_use1_reverse() {
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_END]], label [[LOOP1]], !llvm.loop [[LOOP51:![0-9]+]]
; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP1]] ], [ 1024, [[LOOP_INC]] ], [ 1024, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP1]] ], [ 1024, [[LOOP_INC]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -2104,11 +2058,9 @@ define i64 @same_exit_block_pre_inc_use1_deref_ptrs(ptr dereferenceable(1024) %p
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
@@ -2118,14 +2070,14 @@ define i64 @same_exit_block_pre_inc_use1_deref_ptrs(ptr dereferenceable(1024) %p
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[LOOP_END:%.*]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]]
; CHECK-NEXT: br label [[LOOP_END]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_with_outer_loop.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_with_outer_loop.ll
index 5c1bde0..604e6a8 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit_with_outer_loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_with_outer_loop.ll
@@ -92,7 +92,7 @@ loop.outer.latch:
define i32 @early_exit_branch_to_outer_header() {
; CHECK-LABEL: Loop info for function 'early_exit_branch_to_outer_header':
-; CHECK-NEXT: Loop at depth 1 containing: %outer.header<header>,%loop.header,%loop.latch<exiting>,%outer.header.loopexit<latch>,%scalar.ph,%vector.ph,%vector.body,%middle.split,%middle.block<exiting>,%vector.early.exit
+; CHECK-NEXT: Loop at depth 1 containing: %outer.header<header>,%loop.header,%loop.latch<exiting>,%outer.header.loopexit<latch>,%scalar.ph,%vector.ph,%vector.body,%middle.split<exiting>,%vector.early.exit
; CHECK-NEXT: Loop at depth 2 containing: %loop.header<header><exiting>,%loop.latch<latch><exiting>
; CHECK-NEXT: Loop at depth 2 containing: %vector.body<header><latch><exiting>
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll b/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll
index 36dbc96..9c14a8c 100644
--- a/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll
@@ -46,8 +46,7 @@ define i16 @test(ptr %arg, i64 %N) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[L_1]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP6]], align 2, !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP5]], align 2, !alias.scope [[META0:![0-9]+]]
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i16> [[WIDE_LOAD]], i32 1
; CHECK-NEXT: store i16 [[TMP8]], ptr [[L_2]], align 2, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
diff --git a/llvm/test/Transforms/LoopVectorize/strided-accesses-interleave-only.ll b/llvm/test/Transforms/LoopVectorize/strided-accesses-interleave-only.ll
index 1c5a783..971921a 100644
--- a/llvm/test/Transforms/LoopVectorize/strided-accesses-interleave-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/strided-accesses-interleave-only.ll
@@ -21,9 +21,9 @@ define void @test_variable_stride(ptr %dst, i32 %scale) {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll b/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll
index 717d1f9..1782086 100644
--- a/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll
+++ b/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll
@@ -12,8 +12,7 @@ define void @struct_return_1xi64_replicate(ptr noalias %in, ptr noalias writeonl
; VF4: [[VECTOR_BODY]]:
; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF4-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]]
-; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0
-; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; VF4-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 0
; VF4-NEXT: [[TMP3:%.*]] = tail call { i64 } @fn1(float [[TMP2]]) #[[ATTR0:[0-9]+]]
; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 1
@@ -39,8 +38,7 @@ define void @struct_return_1xi64_replicate(ptr noalias %in, ptr noalias writeonl
; VF4-NEXT: [[TMP24:%.*]] = insertvalue { <4 x i64> } [[TMP20]], <4 x i64> [[TMP23]], 0
; VF4-NEXT: [[TMP25:%.*]] = extractvalue { <4 x i64> } [[TMP24]], 0
; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[OUT_A]], i64 [[INDEX]]
-; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[TMP26]], i32 0
-; VF4-NEXT: store <4 x i64> [[TMP25]], ptr [[TMP27]], align 4
+; VF4-NEXT: store <4 x i64> [[TMP25]], ptr [[TMP26]], align 4
; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VF4-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; VF4-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -55,9 +53,8 @@ define void @struct_return_1xi64_replicate(ptr noalias %in, ptr noalias writeonl
; VF2IC2: [[VECTOR_BODY]]:
; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2IC2-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]]
-; VF2IC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0
; VF2IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 2
-; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4
+; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP0]], align 4
; VF2IC2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP2]], align 4
; VF2IC2-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0
; VF2IC2-NEXT: [[TMP4:%.*]] = tail call { i64 } @fn1(float [[TMP3]]) #[[ATTR0:[0-9]+]]
@@ -84,9 +81,8 @@ define void @struct_return_1xi64_replicate(ptr noalias %in, ptr noalias writeonl
; VF2IC2-NEXT: [[TMP25:%.*]] = extractvalue { <2 x i64> } [[TMP13]], 0
; VF2IC2-NEXT: [[TMP26:%.*]] = extractvalue { <2 x i64> } [[TMP24]], 0
; VF2IC2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[OUT_A]], i64 [[INDEX]]
-; VF2IC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[TMP27]], i32 0
; VF2IC2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[TMP27]], i32 2
-; VF2IC2-NEXT: store <2 x i64> [[TMP25]], ptr [[TMP28]], align 4
+; VF2IC2-NEXT: store <2 x i64> [[TMP25]], ptr [[TMP27]], align 4
; VF2IC2-NEXT: store <2 x i64> [[TMP26]], ptr [[TMP29]], align 4
; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VF2IC2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -122,8 +118,7 @@ define void @struct_return_2xf32_replicate(ptr noalias %in, ptr noalias writeonl
; VF4: [[VECTOR_BODY]]:
; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF4-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]]
-; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0
-; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; VF4-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 0
; VF4-NEXT: [[TMP3:%.*]] = tail call { float, float } @fn2(float [[TMP2]]) #[[ATTR1:[0-9]+]]
; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 1
@@ -166,11 +161,9 @@ define void @struct_return_2xf32_replicate(ptr noalias %in, ptr noalias writeonl
; VF4-NEXT: [[TMP41:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP40]], 0
; VF4-NEXT: [[TMP42:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP40]], 1
; VF4-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[INDEX]]
-; VF4-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP43]], i32 0
-; VF4-NEXT: store <4 x float> [[TMP41]], ptr [[TMP44]], align 4
+; VF4-NEXT: store <4 x float> [[TMP41]], ptr [[TMP43]], align 4
; VF4-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[INDEX]]
-; VF4-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP45]], i32 0
-; VF4-NEXT: store <4 x float> [[TMP42]], ptr [[TMP46]], align 4
+; VF4-NEXT: store <4 x float> [[TMP42]], ptr [[TMP45]], align 4
; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VF4-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; VF4-NEXT: br i1 [[TMP47]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -185,9 +178,8 @@ define void @struct_return_2xf32_replicate(ptr noalias %in, ptr noalias writeonl
; VF2IC2: [[VECTOR_BODY]]:
; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2IC2-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]]
-; VF2IC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 0
; VF2IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 2
-; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4
+; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP0]], align 4
; VF2IC2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP2]], align 4
; VF2IC2-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0
; VF2IC2-NEXT: [[TMP4:%.*]] = tail call { float, float } @fn2(float [[TMP3]]) #[[ATTR1:[0-9]+]]
@@ -232,14 +224,12 @@ define void @struct_return_2xf32_replicate(ptr noalias %in, ptr noalias writeonl
; VF2IC2-NEXT: [[TMP43:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP21]], 1
; VF2IC2-NEXT: [[TMP44:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP40]], 1
; VF2IC2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[INDEX]]
-; VF2IC2-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP45]], i32 0
; VF2IC2-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP45]], i32 2
-; VF2IC2-NEXT: store <2 x float> [[TMP41]], ptr [[TMP46]], align 4
+; VF2IC2-NEXT: store <2 x float> [[TMP41]], ptr [[TMP45]], align 4
; VF2IC2-NEXT: store <2 x float> [[TMP42]], ptr [[TMP47]], align 4
; VF2IC2-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[INDEX]]
-; VF2IC2-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 0
; VF2IC2-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 2
-; VF2IC2-NEXT: store <2 x float> [[TMP43]], ptr [[TMP49]], align 4
+; VF2IC2-NEXT: store <2 x float> [[TMP43]], ptr [[TMP48]], align 4
; VF2IC2-NEXT: store <2 x float> [[TMP44]], ptr [[TMP50]], align 4
; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VF2IC2-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
@@ -279,8 +269,7 @@ define void @struct_return_3xi32_replicate(ptr noalias %in, ptr noalias writeonl
; VF4: [[VECTOR_BODY]]:
; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 [[INDEX]]
-; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
-; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
; VF4-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0
; VF4-NEXT: [[TMP3:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP2]]) #[[ATTR2:[0-9]+]]
; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 1
@@ -338,16 +327,13 @@ define void @struct_return_3xi32_replicate(ptr noalias %in, ptr noalias writeonl
; VF4-NEXT: [[TMP56:%.*]] = insertvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP52]], <4 x i32> [[TMP55]], 2
; VF4-NEXT: [[TMP57:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP56]], 0
; VF4-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[DST_A]], i64 [[INDEX]]
-; VF4-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[TMP58]], i32 0
-; VF4-NEXT: store <4 x i32> [[TMP57]], ptr [[TMP59]], align 4
+; VF4-NEXT: store <4 x i32> [[TMP57]], ptr [[TMP58]], align 4
; VF4-NEXT: [[TMP60:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP56]], 1
; VF4-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, ptr [[DST_B]], i64 [[INDEX]]
-; VF4-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, ptr [[TMP61]], i32 0
-; VF4-NEXT: store <4 x i32> [[TMP60]], ptr [[TMP62]], align 4
+; VF4-NEXT: store <4 x i32> [[TMP60]], ptr [[TMP61]], align 4
; VF4-NEXT: [[TMP63:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP56]], 2
; VF4-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[DST_C]], i64 [[INDEX]]
-; VF4-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[TMP64]], i32 0
-; VF4-NEXT: store <4 x i32> [[TMP63]], ptr [[TMP65]], align 4
+; VF4-NEXT: store <4 x i32> [[TMP63]], ptr [[TMP64]], align 4
; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VF4-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; VF4-NEXT: br i1 [[TMP66]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -362,9 +348,8 @@ define void @struct_return_3xi32_replicate(ptr noalias %in, ptr noalias writeonl
; VF2IC2: [[VECTOR_BODY]]:
; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2IC2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 [[INDEX]]
-; VF2IC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0
; VF2IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 2
-; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4
+; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4
; VF2IC2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
; VF2IC2-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
; VF2IC2-NEXT: [[TMP4:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP3]]) #[[ATTR2:[0-9]+]]
@@ -423,23 +408,20 @@ define void @struct_return_3xi32_replicate(ptr noalias %in, ptr noalias writeonl
; VF2IC2-NEXT: [[TMP57:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP29]], 0
; VF2IC2-NEXT: [[TMP58:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP56]], 0
; VF2IC2-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[DST_A]], i64 [[INDEX]]
-; VF2IC2-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[TMP59]], i32 0
; VF2IC2-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, ptr [[TMP59]], i32 2
-; VF2IC2-NEXT: store <2 x i32> [[TMP57]], ptr [[TMP60]], align 4
+; VF2IC2-NEXT: store <2 x i32> [[TMP57]], ptr [[TMP59]], align 4
; VF2IC2-NEXT: store <2 x i32> [[TMP58]], ptr [[TMP61]], align 4
; VF2IC2-NEXT: [[TMP62:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP29]], 1
; VF2IC2-NEXT: [[TMP63:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP56]], 1
; VF2IC2-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[DST_B]], i64 [[INDEX]]
-; VF2IC2-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[TMP64]], i32 0
; VF2IC2-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[TMP64]], i32 2
-; VF2IC2-NEXT: store <2 x i32> [[TMP62]], ptr [[TMP65]], align 4
+; VF2IC2-NEXT: store <2 x i32> [[TMP62]], ptr [[TMP64]], align 4
; VF2IC2-NEXT: store <2 x i32> [[TMP63]], ptr [[TMP66]], align 4
; VF2IC2-NEXT: [[TMP67:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP29]], 2
; VF2IC2-NEXT: [[TMP68:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP56]], 2
; VF2IC2-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[DST_C]], i64 [[INDEX]]
-; VF2IC2-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[TMP69]], i32 0
; VF2IC2-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[TMP69]], i32 2
-; VF2IC2-NEXT: store <2 x i32> [[TMP67]], ptr [[TMP70]], align 4
+; VF2IC2-NEXT: store <2 x i32> [[TMP67]], ptr [[TMP69]], align 4
; VF2IC2-NEXT: store <2 x i32> [[TMP68]], ptr [[TMP71]], align 4
; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VF2IC2-NEXT: [[TMP72:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-optimize-vector-induction-width.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-optimize-vector-induction-width.ll
index adc4c8e..efc2b8d 100644
--- a/llvm/test/Transforms/LoopVectorize/tail-folding-optimize-vector-induction-width.ll
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-optimize-vector-induction-width.ll
@@ -30,8 +30,8 @@ define void @canonical_small_tc_i8(ptr nocapture noundef writeonly %p) {
; CHECK: [[PRED_STORE_CONTINUE2]]:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[VEC_IND]], splat (i8 2)
-; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[END:.*]]
; CHECK: [[SCALAR_PH]]:
@@ -91,8 +91,8 @@ define void @canonical_upper_limit_i8(ptr nocapture noundef writeonly %p) {
; CHECK: [[PRED_STORE_CONTINUE2]]:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[VEC_IND]], splat (i8 2)
-; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[END:.*]]
; CHECK: [[SCALAR_PH]]:
@@ -213,8 +213,8 @@ define void @canonical_upper_limit_i16(ptr nocapture noundef writeonly %p) {
; CHECK: [[PRED_STORE_CONTINUE2]]:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
-; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
-; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
+; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[END:.*]]
; CHECK: [[SCALAR_PH]]:
@@ -335,8 +335,8 @@ define void @canonical_upper_limit_i32(ptr nocapture noundef writeonly %p) {
; CHECK: [[PRED_STORE_CONTINUE2]]:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
-; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4294967296
-; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4294967296
+; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[END:.*]]
; CHECK: [[SCALAR_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/trip-count-expansion-may-introduce-ub.ll b/llvm/test/Transforms/LoopVectorize/trip-count-expansion-may-introduce-ub.ll
index 514e858..586804b 100644
--- a/llvm/test/Transforms/LoopVectorize/trip-count-expansion-may-introduce-ub.ll
+++ b/llvm/test/Transforms/LoopVectorize/trip-count-expansion-may-introduce-ub.ll
@@ -23,8 +23,7 @@ define i64 @multi_exit_1_exit_count_with_udiv_by_value_in_header(ptr %dst, i64 %
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP6]], align 4
+; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -88,8 +87,7 @@ define i64 @multi_exit_1_exit_count_with_udiv_by_constant_in_header(ptr %dst, i6
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP6]], align 4
+; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -153,8 +151,7 @@ define i64 @multi_exit_2_exit_count_with_udiv_by_value_in_block_executed_uncondi
; CHECK: vector.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 10)
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP7]], i32 0
; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
@@ -265,8 +262,7 @@ define i64 @multi_exit_2_exit_count_with_udiv_by_constant_in_block_executed_unco
; CHECK: vector.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 10)
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP7]], i32 0
; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
@@ -478,8 +474,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch(ptr %dst, i64 %N
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP6]], align 4
+; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -549,8 +544,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_call_before_loop
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP9]], align 4
+; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -620,8 +614,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_loop_may_not_exe
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP9]], align 4
+; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
@@ -691,8 +684,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_different_bounds
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP9]], align 4
+; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
@@ -759,8 +751,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_frozen_value_in_latch(ptr %dst,
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP8]], align 4
+; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP7]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
@@ -826,8 +817,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_constant_in_latch(ptr %dst, i64
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[LOOP_HEADER]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[GEP]], i32 0
-; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP6]], align 4
+; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[GEP]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP20:![0-9]+]]
@@ -887,8 +877,7 @@ define void @single_exit_tc_with_udiv(ptr %dst, i64 %N) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
-; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP4]], align 4
+; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP3]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
@@ -949,8 +938,7 @@ define i64 @multi_exit_4_exit_count_with_urem_by_value_in_latch(ptr %dst, i64 %N
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP8]], align 4
+; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP7]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
@@ -1015,8 +1003,7 @@ define i64 @multi_exit_4_exit_count_with_urem_by_constant_in_latch(ptr %dst, i64
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[LOOP_HEADER]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[GEP]], i32 0
-; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP6]], align 4
+; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[GEP]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP26:![0-9]+]]
@@ -1164,8 +1151,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch1(ptr %dst, i64 %
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP6]], align 4
+; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
@@ -1276,8 +1262,7 @@ define i64 @multi_exit_count_with_udiv_by_value_in_latch_different_bounds_diviso
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP7]], align 4
+; CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[TMP6]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/trunc-extended-icmps.ll b/llvm/test/Transforms/LoopVectorize/trunc-extended-icmps.ll
index 26d8e37..a687ecc3 100644
--- a/llvm/test/Transforms/LoopVectorize/trunc-extended-icmps.ll
+++ b/llvm/test/Transforms/LoopVectorize/trunc-extended-icmps.ll
@@ -15,13 +15,12 @@ define i32 @test_icmp_constant_op_zext(ptr %dst) {
; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i16 1, [[DOTCAST]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[DST]], i16 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <4 x i8> splat (i8 109), ptr [[TMP2]], align 1
+; CHECK-NEXT: store <4 x i8> splat (i8 109), ptr [[TMP1]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996
; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
@@ -35,7 +34,7 @@ define i32 @test_icmp_constant_op_zext(ptr %dst) {
; CHECK-NEXT: store i8 [[OR_TRUNC]], ptr [[GEP]], align 1
; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret i32 0
;
@@ -78,13 +77,12 @@ define i32 @test_icmp_and_op_zext(ptr %dst, i64 %a) {
; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i16 1, [[DOTCAST]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i16 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <4 x i8> [[TMP4]], ptr [[TMP6]], align 1
+; CHECK-NEXT: store <4 x i8> [[TMP4]], ptr [[TMP5]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
@@ -98,7 +96,7 @@ define i32 @test_icmp_and_op_zext(ptr %dst, i64 %a) {
; CHECK-NEXT: store i8 [[OR_TRUNC]], ptr [[GEP]], align 1
; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret i32 0
;
@@ -132,25 +130,22 @@ define void @ext_cmp(ptr %src.1, ptr %src.2, ptr noalias %dst) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[SRC_1]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP2]], align 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP1]], align 2
; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i16> zeroinitializer, [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[SRC_2]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP5]], align 2
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP4]], align 2
; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[WIDE_LOAD1]] to <4 x i16>
; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP3]], <4 x i16> zeroinitializer, <4 x i16> [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = and <4 x i16> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP9]], i32 0
-; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr [[TMP10]], align 2
+; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr [[TMP9]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/trunc-loads-p16.ll b/llvm/test/Transforms/LoopVectorize/trunc-loads-p16.ll
index 956a134..66dc785 100644
--- a/llvm/test/Transforms/LoopVectorize/trunc-loads-p16.ll
+++ b/llvm/test/Transforms/LoopVectorize/trunc-loads-p16.ll
@@ -17,20 +17,18 @@ define void @pr77468(ptr noalias %src, ptr noalias %dst, i1 %x) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[SRC]], i16 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i32> [[WIDE_LOAD]] to <4 x i16>
; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i16> [[TMP3]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i16 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[TMP6]], i32 0
-; CHECK-NEXT: store <4 x i16> [[TMP5]], ptr [[TMP7]], align 2
+; CHECK-NEXT: store <4 x i16> [[TMP5]], ptr [[TMP6]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/trunc-reductions.ll b/llvm/test/Transforms/LoopVectorize/trunc-reductions.ll
index 3afd855..a592312 100644
--- a/llvm/test/Transforms/LoopVectorize/trunc-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/trunc-reductions.ll
@@ -19,7 +19,7 @@ define i8 @reduction_and_trunc(ptr noalias nocapture %ptr) {
; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP4:%.*]] = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> [[TMP2]])
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
@@ -66,7 +66,7 @@ define i16 @reduction_or_trunc(ptr noalias nocapture %ptr) {
; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[TMP2]])
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
@@ -113,7 +113,7 @@ define i16 @reduction_xor_trunc(ptr noalias nocapture %ptr) {
; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> [[TMP2]])
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
@@ -325,7 +325,7 @@ define i32 @reduction_and_or(i16 %a, i32 %b, ptr %src) {
; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP2]])
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
@@ -338,10 +338,9 @@ define i32 @reduction_and_or(i16 %a, i32 %b, ptr %src) {
; CHECK-NEXT: [[OR]] = or i32 [[OR67]], [[L]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], 999
-; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: exit:
-; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[LOOP]] ], [ poison, [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[OR_LCSSA]]
+; CHECK-NEXT: ret i32 [[OR]]
;
entry:
%ext1 = zext i16 %a to i32
diff --git a/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll b/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll
index b94bd90..10e9ae8 100644
--- a/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll
+++ b/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll
@@ -19,15 +19,14 @@ define void @test_pr47927_lshr_const_shift_ops(ptr %dst, i32 %f) {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i8
; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[OFFSET_IDX]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <4 x i8> [[TMP1]], ptr [[TMP5]], align 8
+; CHECK-NEXT: store <4 x i8> [[TMP1]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -78,15 +77,14 @@ define void @test_shl_const_shift_ops(ptr %dst, i32 %f) {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i8
; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[OFFSET_IDX]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <4 x i8> [[TMP1]], ptr [[TMP5]], align 8
+; CHECK-NEXT: store <4 x i8> [[TMP1]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -137,15 +135,14 @@ define void @test_ashr_const_shift_ops(ptr %dst, i32 %f) {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i8
; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[OFFSET_IDX]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <4 x i8> [[TMP1]], ptr [[TMP5]], align 8
+; CHECK-NEXT: store <4 x i8> [[TMP1]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -192,19 +189,18 @@ define void @test_shl_const_shifted_op(ptr %dst, i32 %f) {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i8
; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[OFFSET_IDX]] to i64
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = shl <4 x i32> splat (i32 19), [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[TMP5]] to <4 x i8>
-; CHECK-NEXT: store <4 x i8> [[TMP6]], ptr [[TMP3]], align 8
+; CHECK-NEXT: store <4 x i8> [[TMP6]], ptr [[TMP2]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -256,19 +252,18 @@ define void @test_lshr_by_18(ptr %A) {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i8
; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[OFFSET_IDX]] to i64
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i32> [[TMP4]], splat (i32 18)
; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[TMP5]] to <4 x i8>
-; CHECK-NEXT: store <4 x i8> [[TMP6]], ptr [[TMP3]], align 8
+; CHECK-NEXT: store <4 x i8> [[TMP6]], ptr [[TMP2]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -319,19 +314,18 @@ define void @test_lshr_by_4(ptr %A) {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i8
; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[OFFSET_IDX]] to i64
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i16>
; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i16> [[TMP4]], splat (i16 4)
; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i16> [[TMP5]] to <4 x i8>
-; CHECK-NEXT: store <4 x i8> [[TMP6]], ptr [[TMP3]], align 8
+; CHECK-NEXT: store <4 x i8> [[TMP6]], ptr [[TMP2]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/uitofp-preserve-nneg.ll b/llvm/test/Transforms/LoopVectorize/uitofp-preserve-nneg.ll
index 24d099c..c678175 100644
--- a/llvm/test/Transforms/LoopVectorize/uitofp-preserve-nneg.ll
+++ b/llvm/test/Transforms/LoopVectorize/uitofp-preserve-nneg.ll
@@ -16,16 +16,15 @@ define void @uitofp_preserve_nneg(ptr %result, i32 %size, float %y) {
; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP0]], [[BROADCAST_SPLAT3]]
; CHECK-NEXT: [[INDEX:%.*]] = zext nneg i32 [[INDEX1]] to i64
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[RESULT:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
-; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX1]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[FOR_BODY_PREHEADER4]]
+; CHECK-NEXT: br label [[FOR_EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[FOR_BODY_PREHEADER4]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
index 67d99dc..85cf925 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
@@ -15,15 +15,14 @@ define void @blend_uniform_iv_trunc(i1 %c) {
; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i16
; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[C]], i16 [[TMP0]], i16 poison
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i16 [[TMP6]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[TMP7]], i32 0
-; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr [[TMP3]], align 2
+; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr [[TMP7]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -75,15 +74,14 @@ define void @blend_uniform_iv(i1 %c) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[C]], i64 [[INDEX]], i64 poison
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP6]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP7]], i32 0
-; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr [[TMP2]], align 2
+; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr [[TMP7]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -156,9 +154,9 @@ define void @blend_chain_iv(i1 %c) {
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
index 3aad626..82f2fdd4 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
@@ -15,17 +15,15 @@ define void @ld_div1_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = udiv i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP5]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: store <2 x i64> [[TMP2]], ptr [[TMP3]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
;
entry:
@@ -63,13 +61,12 @@ define void @ld_div2_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8
+; CHECK-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
;
entry:
@@ -111,14 +108,13 @@ define void @ld_div3_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1
; CHECK-NEXT: [[TMP9:%.*]] = add nsw <2 x i64> [[TMP8]], splat (i64 42)
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[TMP11]], align 8
+; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[TMP10]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
;
entry:
@@ -173,7 +169,7 @@ define void @ld_div1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
;
entry:
@@ -208,20 +204,19 @@ define void @ld_div2_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP0]], 2
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
+; CHECK-NEXT: store i64 [[TMP7]], ptr [[TMP5]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
; CHECK-NEXT: store i64 [[TMP8]], ptr [[TMP6]], align 8
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
-; CHECK-NEXT: store i64 [[TMP9]], ptr [[TMP7]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
;
entry:
@@ -276,7 +271,7 @@ define void @ld_div3_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
;
entry:
@@ -331,7 +326,7 @@ define void @ld_div1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
;
entry:
@@ -386,7 +381,7 @@ define void @ld_div2_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
;
entry:
@@ -421,20 +416,19 @@ define void @ld_div3_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3
; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP0]], 3
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
+; CHECK-NEXT: store i64 [[TMP7]], ptr [[TMP5]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
; CHECK-NEXT: store i64 [[TMP8]], ptr [[TMP6]], align 8
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
-; CHECK-NEXT: store i64 [[TMP9]], ptr [[TMP7]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
;
entry:
@@ -467,17 +461,15 @@ define void @ld_div1_step1_start1_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
; CHECK-NEXT: [[TMP0:%.*]] = udiv i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP5]], align 8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: store <2 x i64> [[TMP2]], ptr [[TMP3]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
;
entry:
@@ -520,14 +512,13 @@ define void @ld_div2_step1_start1_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1
; CHECK-NEXT: [[TMP9:%.*]] = add nsw <2 x i64> [[TMP8]], splat (i64 42)
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[TMP11]], align 8
+; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[TMP10]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
;
entry:
@@ -570,14 +561,13 @@ define void @ld_div3_step1_start1_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1
; CHECK-NEXT: [[TMP9:%.*]] = add nsw <2 x i64> [[TMP8]], splat (i64 42)
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[TMP11]], align 8
+; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[TMP10]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
;
entry:
@@ -633,7 +623,7 @@ define void @ld_div1_step2_start1_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
;
entry:
@@ -669,20 +659,19 @@ define void @ld_div2_step2_start1_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2
; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP1]], 2
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
+; CHECK-NEXT: store i64 [[TMP8]], ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
; CHECK-NEXT: store i64 [[TMP9]], ptr [[TMP7]], align 8
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
-; CHECK-NEXT: store i64 [[TMP10]], ptr [[TMP8]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
;
entry:
@@ -738,7 +727,7 @@ define void @ld_div3_step2_start1_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
;
entry:
@@ -794,7 +783,7 @@ define void @ld_div1_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
;
entry:
@@ -850,7 +839,7 @@ define void @ld_div2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
;
entry:
@@ -886,20 +875,19 @@ define void @ld_div3_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3
; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP1]], 3
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
+; CHECK-NEXT: store i64 [[TMP8]], ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
; CHECK-NEXT: store i64 [[TMP9]], ptr [[TMP7]], align 8
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
-; CHECK-NEXT: store i64 [[TMP10]], ptr [[TMP8]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
;
entry:
@@ -947,7 +935,7 @@ define void @test_step_is_not_invariant(ptr %A) {
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 56
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll
index 1f331a4..af2b238 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll
@@ -15,17 +15,8 @@ define void @ld_and_neg1_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = and i64 [[INDEX]], -1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP5]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
;
entry:
@@ -63,13 +54,8 @@ define void @ld_and_neg2_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
;
entry:
@@ -111,14 +97,8 @@ define void @ld_and_neg3_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1
; CHECK-NEXT: [[TMP9:%.*]] = add nsw <2 x i64> [[TMP8]], splat (i64 42)
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[TMP11]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
;
entry:
@@ -173,7 +153,7 @@ define void @ld_and_neg1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
;
entry:
@@ -218,7 +198,7 @@ define void @ld_and_neg2_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
;
entry:
@@ -273,7 +253,7 @@ define void @ld_and_neg1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
;
entry:
@@ -328,7 +308,7 @@ define void @ld_and_neg2_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
;
entry:
@@ -371,14 +351,8 @@ define void @ld_and_neg2_step1_start1_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1
; CHECK-NEXT: [[TMP9:%.*]] = add nsw <2 x i64> [[TMP8]], splat (i64 42)
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[TMP11]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
;
entry:
@@ -434,7 +408,7 @@ define void @ld_and_neg2_step2_start1_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
;
entry:
@@ -490,7 +464,7 @@ define void @ld_and_neg2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
;
entry:
@@ -546,7 +520,7 @@ define void @ld_and_neg3_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
index ea8831c..61f511c 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
@@ -51,14 +51,13 @@ define void @ld_div2_urem3_1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP33:%.*]] = insertelement <8 x i64> [[TMP32]], i64 [[TMP25]], i32 7
; CHECK-NEXT: [[TMP34:%.*]] = add nsw <8 x i64> [[TMP33]], splat (i64 42)
; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i64, ptr [[TMP35]], i32 0
-; CHECK-NEXT: store <8 x i64> [[TMP34]], ptr [[TMP36]], align 8
+; CHECK-NEXT: store <8 x i64> [[TMP34]], ptr [[TMP35]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
-; CHECK-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
;
entry:
@@ -127,14 +126,13 @@ define void @ld_div2_urem3_2(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP34:%.*]] = insertelement <8 x i64> [[TMP33]], i64 [[TMP26]], i32 7
; CHECK-NEXT: [[TMP35:%.*]] = add nsw <8 x i64> [[TMP34]], splat (i64 42)
; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i64, ptr [[TMP36]], i32 0
-; CHECK-NEXT: store <8 x i64> [[TMP35]], ptr [[TMP37]], align 8
+; CHECK-NEXT: store <8 x i64> [[TMP35]], ptr [[TMP36]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
-; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
;
entry:
@@ -201,14 +199,13 @@ define void @ld_div4(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP32:%.*]] = insertelement <8 x i64> [[TMP31]], i64 [[TMP24]], i32 7
; CHECK-NEXT: [[TMP33:%.*]] = add nsw <8 x i64> [[TMP32]], splat (i64 42)
; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i64, ptr [[TMP34]], i32 0
-; CHECK-NEXT: store <8 x i64> [[TMP33]], ptr [[TMP35]], align 8
+; CHECK-NEXT: store <8 x i64> [[TMP33]], ptr [[TMP34]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
-; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
;
entry:
@@ -247,13 +244,12 @@ define void @ld_div8_urem3(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TMP4]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; CHECK-NEXT: store <8 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
+; CHECK-NEXT: store <8 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll
index 1f33f7a..e412d13 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll
@@ -16,17 +16,15 @@ define void @ld_lshr0_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP0:%.*]] = lshr i64 [[INDEX]], 0
; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
-; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
-; VF2-NEXT: [[TMP3:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
-; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; VF2-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP5]], align 8
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8
+; VF2-NEXT: [[TMP2:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
+; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; VF2-NEXT: store <2 x i64> [[TMP2]], ptr [[TMP3]], align 8
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VF2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; VF2-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF2-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF2-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[EXIT:%.*]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_lshr0_step1_start0_ind1
@@ -39,17 +37,15 @@ define void @ld_lshr0_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VF4-NEXT: [[TMP0:%.*]] = lshr i64 [[INDEX]], 0
; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
-; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
-; VF4-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[WIDE_LOAD]], splat (i64 42)
-; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; VF4-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP5]], align 8
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
+; VF4-NEXT: [[TMP2:%.*]] = add nsw <4 x i64> [[WIDE_LOAD]], splat (i64 42)
+; VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; VF4-NEXT: store <4 x i64> [[TMP2]], ptr [[TMP3]], align 8
; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VF4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; VF4-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF4-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF4-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[EXIT:%.*]]
; VF4: scalar.ph:
;
entry:
@@ -87,13 +83,12 @@ define void @ld_lshr1_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
; VF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; VF2-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8
+; VF2-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VF2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; VF2-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF2-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[EXIT:%.*]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_lshr1_step1_start0_ind1
@@ -124,14 +119,13 @@ define void @ld_lshr1_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 3
; VF4-NEXT: [[TMP17:%.*]] = add nsw <4 x i64> [[TMP16]], splat (i64 42)
; VF4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[TMP18]], i32 0
-; VF4-NEXT: store <4 x i64> [[TMP17]], ptr [[TMP19]], align 8
+; VF4-NEXT: store <4 x i64> [[TMP17]], ptr [[TMP18]], align 8
; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
-; VF4-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; VF4-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF4-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF4-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[EXIT:%.*]]
; VF4: scalar.ph:
;
entry:
@@ -169,13 +163,12 @@ define void @ld_lshr2_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
; VF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; VF2-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8
+; VF2-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VF2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; VF2-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF2-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[EXIT:%.*]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_lshr2_step1_start0_ind1
@@ -193,13 +186,12 @@ define void @ld_lshr2_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP3]], i64 0
; VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; VF4-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8
+; VF4-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VF4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; VF4-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF4-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF4-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[EXIT:%.*]]
; VF4: scalar.ph:
;
entry:
@@ -254,7 +246,7 @@ define void @ld_lshr0_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
; VF2-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[EXIT:%.*]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_lshr0_step2_start0_ind1
@@ -306,7 +298,7 @@ define void @ld_lshr0_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
; VF4-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[EXIT:%.*]]
; VF4: scalar.ph:
;
entry:
@@ -341,20 +333,19 @@ define void @ld_lshr1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
; VF2-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 1
; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
-; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
-; VF2-NEXT: [[TMP5:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
-; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
+; VF2-NEXT: [[TMP4:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
+; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
+; VF2-NEXT: store i64 [[TMP7]], ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
; VF2-NEXT: store i64 [[TMP8]], ptr [[TMP6]], align 8
-; VF2-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
-; VF2-NEXT: store i64 [[TMP9]], ptr [[TMP7]], align 8
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VF2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
-; VF2-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; VF2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
+; VF2-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[EXIT:%.*]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_lshr1_step2_start0_ind1
@@ -372,26 +363,25 @@ define void @ld_lshr1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6
; VF4-NEXT: [[TMP4:%.*]] = lshr i64 [[TMP0]], 1
; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 8
-; VF4-NEXT: [[TMP7:%.*]] = add nsw <4 x i64> [[WIDE_LOAD]], splat (i64 42)
-; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
-; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
+; VF4-NEXT: [[TMP6:%.*]] = add nsw <4 x i64> [[WIDE_LOAD]], splat (i64 42)
+; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
+; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0
+; VF4-NEXT: store i64 [[TMP11]], ptr [[TMP7]], align 8
+; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1
; VF4-NEXT: store i64 [[TMP12]], ptr [[TMP8]], align 8
-; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1
+; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2
; VF4-NEXT: store i64 [[TMP13]], ptr [[TMP9]], align 8
-; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2
+; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3
; VF4-NEXT: store i64 [[TMP14]], ptr [[TMP10]], align 8
-; VF4-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3
-; VF4-NEXT: store i64 [[TMP15]], ptr [[TMP11]], align 8
; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
-; VF4-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; VF4-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
+; VF4-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[EXIT:%.*]]
; VF4: scalar.ph:
;
entry:
@@ -446,7 +436,7 @@ define void @ld_lshr0_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; VF2-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[SCALAR_PH]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_lshr0_step3_start0_ind1
@@ -498,7 +488,7 @@ define void @ld_lshr0_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; VF4-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[SCALAR_PH]]
; VF4: scalar.ph:
;
entry:
@@ -553,7 +543,7 @@ define void @ld_lshr1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; VF2-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[SCALAR_PH]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_lshr1_step3_start0_ind1
@@ -605,7 +595,7 @@ define void @ld_lshr1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; VF4-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[SCALAR_PH]]
; VF4: scalar.ph:
;
entry:
@@ -649,14 +639,13 @@ define void @ld_lshr1_step1_start1_ind1(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1
; VF2-NEXT: [[TMP9:%.*]] = add nsw <2 x i64> [[TMP8]], splat (i64 42)
; VF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
-; VF2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; VF2-NEXT: store <2 x i64> [[TMP9]], ptr [[TMP11]], align 8
+; VF2-NEXT: store <2 x i64> [[TMP9]], ptr [[TMP10]], align 8
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
-; VF2-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
-; VF2-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; VF2-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
+; VF2-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[SCALAR_PH]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_lshr1_step1_start1_ind1
@@ -688,14 +677,13 @@ define void @ld_lshr1_step1_start1_ind1(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 3
; VF4-NEXT: [[TMP17:%.*]] = add nsw <4 x i64> [[TMP16]], splat (i64 42)
; VF4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
-; VF4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[TMP18]], i32 0
-; VF4-NEXT: store <4 x i64> [[TMP17]], ptr [[TMP19]], align 8
+; VF4-NEXT: store <4 x i64> [[TMP17]], ptr [[TMP18]], align 8
; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
-; VF4-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996
-; VF4-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; VF4-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996
+; VF4-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[SCALAR_PH]]
; VF4: scalar.ph:
;
entry:
@@ -731,20 +719,19 @@ define void @ld_lshr1_step2_start1_ind1(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2
; VF2-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP1]], 1
; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]]
-; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
-; VF2-NEXT: [[TMP6:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
-; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF2-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
+; VF2-NEXT: [[TMP5:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
+; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
+; VF2-NEXT: store i64 [[TMP8]], ptr [[TMP6]], align 8
+; VF2-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
; VF2-NEXT: store i64 [[TMP9]], ptr [[TMP7]], align 8
-; VF2-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
-; VF2-NEXT: store i64 [[TMP10]], ptr [[TMP8]], align 8
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VF2-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498
-; VF2-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; VF2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498
+; VF2-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[SCALAR_PH]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_lshr1_step2_start1_ind1
@@ -763,26 +750,25 @@ define void @ld_lshr1_step2_start1_ind1(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 6
; VF4-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP1]], 1
; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]]
-; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
-; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
-; VF4-NEXT: [[TMP8:%.*]] = add nsw <4 x i64> [[WIDE_LOAD]], splat (i64 42)
-; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
-; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
-; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
-; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP8]], i32 0
+; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 8
+; VF4-NEXT: [[TMP7:%.*]] = add nsw <4 x i64> [[WIDE_LOAD]], splat (i64 42)
+; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
+; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
+; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]]
+; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0
+; VF4-NEXT: store i64 [[TMP12]], ptr [[TMP8]], align 8
+; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1
; VF4-NEXT: store i64 [[TMP13]], ptr [[TMP9]], align 8
-; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP8]], i32 1
+; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2
; VF4-NEXT: store i64 [[TMP14]], ptr [[TMP10]], align 8
-; VF4-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP8]], i32 2
+; VF4-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3
; VF4-NEXT: store i64 [[TMP15]], ptr [[TMP11]], align 8
-; VF4-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP8]], i32 3
-; VF4-NEXT: store i64 [[TMP16]], ptr [[TMP12]], align 8
; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VF4-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 496
-; VF4-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; VF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 496
+; VF4-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[SCALAR_PH]]
; VF4: scalar.ph:
;
entry:
@@ -838,7 +824,7 @@ define void @ld_lshr1_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; VF2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[SCALAR_PH]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_lshr1_step3_start1_ind1
@@ -891,7 +877,7 @@ define void @ld_lshr1_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; VF4-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[SCALAR_PH]]
; VF4: scalar.ph:
;
entry:
@@ -947,7 +933,7 @@ define void @ld_lshr2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; VF2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[SCALAR_PH]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_lshr2_step3_start1_ind1
@@ -1000,7 +986,7 @@ define void @ld_lshr2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; VF4-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[SCALAR_PH]]
; VF4: scalar.ph:
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll
index ef62557..ef6ce08 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll
@@ -27,15 +27,14 @@ define void @ld_div1_step1_start0_ind2(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
; VF2-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42)
; VF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
-; VF2-NEXT: store <2 x i64> [[TMP11]], ptr [[TMP13]], align 8
+; VF2-NEXT: store <2 x i64> [[TMP11]], ptr [[TMP12]], align 8
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
-; VF2-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; VF2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[EXIT:%.*]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_div1_step1_start0_ind2
@@ -69,15 +68,14 @@ define void @ld_div1_step1_start0_ind2(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 3
; VF4-NEXT: [[TMP19:%.*]] = add nsw <4 x i64> [[TMP18]], splat (i64 42)
; VF4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF4-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP20]], i32 0
-; VF4-NEXT: store <4 x i64> [[TMP19]], ptr [[TMP21]], align 8
+; VF4-NEXT: store <4 x i64> [[TMP19]], ptr [[TMP20]], align 8
; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
-; VF4-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; VF4-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF4-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF4-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[EXIT:%.*]]
; VF4: scalar.ph:
;
entry:
@@ -121,13 +119,12 @@ define void @ld_div2_step1_start0_ind2(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0
; VF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
-; VF2-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8
+; VF2-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VF2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; VF2-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF2-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF2-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[EXIT:%.*]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_div2_step1_start0_ind2
@@ -161,15 +158,14 @@ define void @ld_div2_step1_start0_ind2(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 3
; VF4-NEXT: [[TMP19:%.*]] = add nsw <4 x i64> [[TMP18]], splat (i64 42)
; VF4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF4-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP20]], i32 0
-; VF4-NEXT: store <4 x i64> [[TMP19]], ptr [[TMP21]], align 8
+; VF4-NEXT: store <4 x i64> [[TMP19]], ptr [[TMP20]], align 8
; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
-; VF4-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; VF4-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF4-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF4-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[EXIT:%.*]]
; VF4: scalar.ph:
;
entry:
@@ -218,15 +214,14 @@ define void @ld_div3_step1_start0_ind2(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
; VF2-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42)
; VF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
-; VF2-NEXT: store <2 x i64> [[TMP11]], ptr [[TMP13]], align 8
+; VF2-NEXT: store <2 x i64> [[TMP11]], ptr [[TMP12]], align 8
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
-; VF2-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; VF2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[EXIT:%.*]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_div3_step1_start0_ind2
@@ -260,15 +255,14 @@ define void @ld_div3_step1_start0_ind2(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 3
; VF4-NEXT: [[TMP19:%.*]] = add nsw <4 x i64> [[TMP18]], splat (i64 42)
; VF4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF4-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP20]], i32 0
-; VF4-NEXT: store <4 x i64> [[TMP19]], ptr [[TMP21]], align 8
+; VF4-NEXT: store <4 x i64> [[TMP19]], ptr [[TMP20]], align 8
; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
-; VF4-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; VF4-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF4-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; VF4-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[EXIT:%.*]]
; VF4: scalar.ph:
;
entry:
@@ -331,7 +325,7 @@ define void @ld_div1_step2_start0_ind2(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
; VF2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[EXIT:%.*]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_div1_step2_start0_ind2
@@ -387,7 +381,7 @@ define void @ld_div1_step2_start0_ind2(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
; VF4-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[EXIT:%.*]]
; VF4: scalar.ph:
;
entry:
@@ -450,7 +444,7 @@ define void @ld_div2_step2_start0_ind2(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
; VF2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[EXIT:%.*]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_div2_step2_start0_ind2
@@ -506,7 +500,7 @@ define void @ld_div2_step2_start0_ind2(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
; VF4-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[EXIT:%.*]]
; VF4: scalar.ph:
;
entry:
@@ -569,7 +563,7 @@ define void @ld_div3_step2_start0_ind2(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
; VF2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[EXIT:%.*]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_div3_step2_start0_ind2
@@ -625,7 +619,7 @@ define void @ld_div3_step2_start0_ind2(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
; VF4-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[EXIT:%.*]]
; VF4: scalar.ph:
;
entry:
@@ -688,7 +682,7 @@ define void @ld_div1_step3_start0_ind2(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; VF2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[SCALAR_PH]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_div1_step3_start0_ind2
@@ -744,7 +738,7 @@ define void @ld_div1_step3_start0_ind2(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; VF4-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[SCALAR_PH]]
; VF4: scalar.ph:
;
entry:
@@ -807,7 +801,7 @@ define void @ld_div2_step3_start0_ind2(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; VF2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[SCALAR_PH]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_div2_step3_start0_ind2
@@ -863,7 +857,7 @@ define void @ld_div2_step3_start0_ind2(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; VF4-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[SCALAR_PH]]
; VF4: scalar.ph:
;
entry:
@@ -926,7 +920,7 @@ define void @ld_div3_step3_start0_ind2(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; VF2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[SCALAR_PH]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_div3_step3_start0_ind2
@@ -982,7 +976,7 @@ define void @ld_div3_step3_start0_ind2(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; VF4-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[SCALAR_PH]]
; VF4: scalar.ph:
;
entry:
@@ -1032,15 +1026,14 @@ define void @ld_div1_step1_start1_ind2(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
; VF2-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42)
; VF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
-; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
-; VF2-NEXT: store <2 x i64> [[TMP11]], ptr [[TMP13]], align 8
+; VF2-NEXT: store <2 x i64> [[TMP11]], ptr [[TMP12]], align 8
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
-; VF2-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
-; VF2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; VF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
+; VF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[SCALAR_PH]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_div1_step1_start1_ind2
@@ -1075,15 +1068,14 @@ define void @ld_div1_step1_start1_ind2(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 3
; VF4-NEXT: [[TMP19:%.*]] = add nsw <4 x i64> [[TMP18]], splat (i64 42)
; VF4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
-; VF4-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP20]], i32 0
-; VF4-NEXT: store <4 x i64> [[TMP19]], ptr [[TMP21]], align 8
+; VF4-NEXT: store <4 x i64> [[TMP19]], ptr [[TMP20]], align 8
; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
-; VF4-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996
-; VF4-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; VF4-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996
+; VF4-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[SCALAR_PH]]
; VF4: scalar.ph:
;
entry:
@@ -1133,15 +1125,14 @@ define void @ld_div2_step1_start1_ind2(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
; VF2-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42)
; VF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
-; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
-; VF2-NEXT: store <2 x i64> [[TMP11]], ptr [[TMP13]], align 8
+; VF2-NEXT: store <2 x i64> [[TMP11]], ptr [[TMP12]], align 8
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
-; VF2-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
-; VF2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; VF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
+; VF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[SCALAR_PH]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_div2_step1_start1_ind2
@@ -1176,15 +1167,14 @@ define void @ld_div2_step1_start1_ind2(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 3
; VF4-NEXT: [[TMP19:%.*]] = add nsw <4 x i64> [[TMP18]], splat (i64 42)
; VF4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
-; VF4-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP20]], i32 0
-; VF4-NEXT: store <4 x i64> [[TMP19]], ptr [[TMP21]], align 8
+; VF4-NEXT: store <4 x i64> [[TMP19]], ptr [[TMP20]], align 8
; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
-; VF4-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996
-; VF4-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; VF4-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996
+; VF4-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[SCALAR_PH]]
; VF4: scalar.ph:
;
entry:
@@ -1234,15 +1224,14 @@ define void @ld_div3_step1_start1_ind2(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1
; VF2-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42)
; VF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
-; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
-; VF2-NEXT: store <2 x i64> [[TMP11]], ptr [[TMP13]], align 8
+; VF2-NEXT: store <2 x i64> [[TMP11]], ptr [[TMP12]], align 8
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2)
-; VF2-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
-; VF2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; VF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
+; VF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[SCALAR_PH]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_div3_step1_start1_ind2
@@ -1277,15 +1266,14 @@ define void @ld_div3_step1_start1_ind2(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 3
; VF4-NEXT: [[TMP19:%.*]] = add nsw <4 x i64> [[TMP18]], splat (i64 42)
; VF4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
-; VF4-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP20]], i32 0
-; VF4-NEXT: store <4 x i64> [[TMP19]], ptr [[TMP21]], align 8
+; VF4-NEXT: store <4 x i64> [[TMP19]], ptr [[TMP20]], align 8
; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
-; VF4-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996
-; VF4-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; VF4-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996
+; VF4-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[SCALAR_PH]]
; VF4: scalar.ph:
;
entry:
@@ -1349,7 +1337,7 @@ define void @ld_div1_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498
; VF2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[SCALAR_PH]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_div1_step2_start1_ind2
@@ -1406,7 +1394,7 @@ define void @ld_div1_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], 496
; VF4-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[SCALAR_PH]]
; VF4: scalar.ph:
;
entry:
@@ -1470,7 +1458,7 @@ define void @ld_div2_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498
; VF2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[SCALAR_PH]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_div2_step2_start1_ind2
@@ -1527,7 +1515,7 @@ define void @ld_div2_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], 496
; VF4-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[SCALAR_PH]]
; VF4: scalar.ph:
;
entry:
@@ -1591,7 +1579,7 @@ define void @ld_div3_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498
; VF2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[SCALAR_PH]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_div3_step2_start1_ind2
@@ -1648,7 +1636,7 @@ define void @ld_div3_step2_start1_ind2(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], 496
; VF4-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[SCALAR_PH]]
; VF4: scalar.ph:
;
entry:
@@ -1712,7 +1700,7 @@ define void @ld_div1_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; VF2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[SCALAR_PH]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_div1_step3_start1_ind2
@@ -1769,7 +1757,7 @@ define void @ld_div1_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; VF4-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[SCALAR_PH]]
; VF4: scalar.ph:
;
entry:
@@ -1833,7 +1821,7 @@ define void @ld_div2_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; VF2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[SCALAR_PH]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_div2_step3_start1_ind2
@@ -1890,7 +1878,7 @@ define void @ld_div2_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; VF4-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[SCALAR_PH]]
; VF4: scalar.ph:
;
entry:
@@ -1954,7 +1942,7 @@ define void @ld_div3_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; VF2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
; VF2: middle.block:
-; VF2-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF2-NEXT: br label [[SCALAR_PH]]
; VF2: scalar.ph:
;
; VF4-LABEL: define void @ld_div3_step3_start1_ind2
@@ -2011,7 +1999,7 @@ define void @ld_div3_step3_start1_ind2(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332
; VF4-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
; VF4: middle.block:
-; VF4-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; VF4-NEXT: br label [[SCALAR_PH]]
; VF4: scalar.ph:
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll b/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll
index 3b442a9..0541c9d 100644
--- a/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll
+++ b/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll
@@ -18,15 +18,14 @@ define void @test_not_first_lane_only_constant(ptr %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[B]], align 2
; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i16> poison, i16 [[TMP13]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT5]], <4 x i16> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <4 x i16> [[BROADCAST_SPLAT6]], ptr [[TMP2]], align 2
+; CHECK-NEXT: store <4 x i16> [[BROADCAST_SPLAT6]], ptr [[TMP1]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
; CHECK: loop.header:
; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
@@ -88,22 +87,21 @@ define void @test_not_first_lane_only_wide_compare(ptr %A, ptr noalias %B, i16 %
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[A]], i16 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP2]], align 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP1]], align 2
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i16 [[TMP3]], [[X]]
; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP4]], ptr poison, ptr [[B]]
; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 2
; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i16> poison, i16 [[TMP13]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT5]], <4 x i16> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: store <4 x i16> [[BROADCAST_SPLAT6]], ptr [[TMP2]], align 2
+; CHECK-NEXT: store <4 x i16> [[BROADCAST_SPLAT6]], ptr [[TMP1]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
; CHECK: loop.header:
; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
@@ -171,22 +169,21 @@ define void @test_not_first_lane_only_wide_compare_incoming_order_swapped(ptr %A
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[A]], i16 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP2]], align 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP1]], align 2
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i16 [[TMP3]], [[X]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP4]], ptr poison, ptr [[B]]
; CHECK-NEXT: [[TMP12:%.*]] = load i16, ptr [[PREDPHI]], align 2
; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP12]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT3]], <4 x i16> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: store <4 x i16> [[BROADCAST_SPLAT4]], ptr [[TMP2]], align 2
+; CHECK-NEXT: store <4 x i16> [[BROADCAST_SPLAT4]], ptr [[TMP1]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
; CHECK: loop.header:
; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll b/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
index df8123d..3b34b75 100644
--- a/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
+++ b/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
@@ -28,10 +28,8 @@ define void @basic_loop(ptr nocapture readonly %ptr, i32 %size, ptr %pos) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], ptr [[NEXT_GEP]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -91,10 +89,8 @@ define void @metadata(ptr nocapture readonly %ptr, i32 %size, ptr %pos) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], ptr [[NEXT_GEP]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/vector-intrinsic-call-cost.ll b/llvm/test/Transforms/LoopVectorize/vector-intrinsic-call-cost.ll
index 3d3b6c4..f80fb8c 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-intrinsic-call-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-intrinsic-call-cost.ll
@@ -4,12 +4,10 @@
; CHECK-LABEL: vector.body:
; CHECK-NEXT: [[IDX:%.+]] = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK-NEXT: [[GEP:%.+]] = getelementptr inbounds i16, ptr %src, i32 %index
-; CHECK-NEXT: [[GEP0:%.+]] = getelementptr inbounds i16, ptr [[GEP]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.+]] = load <4 x i16>, ptr [[GEP0]], align 2
+; CHECK-NEXT: [[WIDE_LOAD:%.+]] = load <4 x i16>, ptr [[GEP]], align 2
; CHECK-NEXT: [[FSHL:%.+]] = call <4 x i16> @llvm.fshl.v4i16(<4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD]], <4 x i16> splat (i16 15))
; CHECK-NEXT: [[GEP0:%.+]] = getelementptr inbounds i16, ptr %dst, i32 %index
-; CHECK-NEXT: [[GEP1:%.+]] = getelementptr inbounds i16, ptr [[GEP0]], i32 0
-; CHECK-NEXT: store <4 x i16> [[FSHL]], ptr [[GEP1]], align 2
+; CHECK-NEXT: store <4 x i16> [[FSHL]], ptr [[GEP0]], align 2
; CHECK-NEXT: [[IDX_NEXT:%.+]] = add nuw i32 [[IDX]], 4
; CHECK-NEXT: [[EC:%.+]] = icmp eq i32 [[IDX_NEXT]], %n.vec
; CHECK-NEXT: br i1 [[EC]], label %middle.block, label %vector.body
diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-branch-weights.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-branch-weights.ll
index d5acf5c..38dbbbb 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-branch-weights.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-branch-weights.ll
@@ -18,10 +18,9 @@ define void @test_tc_between_8_and_17(ptr %A, i64 range(i64 8, 17) %N) {
; VF8UF1: [[VECTOR_BODY]]:
; VF8UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF8UF1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; VF8UF1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; VF8UF1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; VF8UF1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[NEXT_GEP]], align 1
; VF8UF1-NEXT: [[TMP2:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
-; VF8UF1-NEXT: store <8 x i8> [[TMP2]], ptr [[TMP1]], align 1
+; VF8UF1-NEXT: store <8 x i8> [[TMP2]], ptr [[NEXT_GEP]], align 1
; VF8UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; VF8UF1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; VF8UF1-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
@@ -56,20 +55,18 @@ define void @test_tc_between_8_and_17(ptr %A, i64 range(i64 8, 17) %N) {
; VF8UF2-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF8UF2: [[VECTOR_BODY]]:
-; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i32 0
; VF8UF2-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[A]], i32 8
-; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[A]], align 1
; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
; VF8UF2-NEXT: [[TMP3:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
; VF8UF2-NEXT: [[TMP4:%.*]] = add nsw <8 x i8> [[WIDE_LOAD1]], splat (i8 10)
-; VF8UF2-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[A]], i32 0
; VF8UF2-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i32 8
-; VF8UF2-NEXT: store <8 x i8> [[TMP3]], ptr [[TMP5]], align 1
+; VF8UF2-NEXT: store <8 x i8> [[TMP3]], ptr [[A]], align 1
; VF8UF2-NEXT: store <8 x i8> [[TMP4]], ptr [[TMP6]], align 1
; VF8UF2-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; VF8UF2: [[MIDDLE_BLOCK]]:
; VF8UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; VF8UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]], !prof [[PROF1:![0-9]+]]
; VF8UF2: [[SCALAR_PH]]:
; VF8UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; VF8UF2-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
@@ -83,7 +80,7 @@ define void @test_tc_between_8_and_17(ptr %A, i64 range(i64 8, 17) %N) {
; VF8UF2-NEXT: store i8 [[ADD]], ptr [[P_SRC]], align 1
; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !prof [[PROF2:![0-9]+]], !llvm.loop [[LOOP3:![0-9]+]]
; VF8UF2: [[EXIT]]:
; VF8UF2-NEXT: ret void
;
@@ -98,15 +95,13 @@ define void @test_tc_between_8_and_17(ptr %A, i64 range(i64 8, 17) %N) {
; VF16UF1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]]
; VF16UF1: [[VECTOR_BODY]]:
-; VF16UF1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i32 0
-; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[A]], align 1
; VF16UF1-NEXT: [[TMP2:%.*]] = add nsw <16 x i8> [[WIDE_LOAD]], splat (i8 10)
-; VF16UF1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i32 0
-; VF16UF1-NEXT: store <16 x i8> [[TMP2]], ptr [[TMP3]], align 1
+; VF16UF1-NEXT: store <16 x i8> [[TMP2]], ptr [[A]], align 1
; VF16UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; VF16UF1: [[MIDDLE_BLOCK]]:
; VF16UF1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; VF16UF1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF16UF1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]], !prof [[PROF1:![0-9]+]]
; VF16UF1: [[SCALAR_PH]]:
; VF16UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; VF16UF1-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
@@ -120,7 +115,7 @@ define void @test_tc_between_8_and_17(ptr %A, i64 range(i64 8, 17) %N) {
; VF16UF1-NEXT: store i8 [[ADD]], ptr [[P_SRC]], align 1
; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !prof [[PROF2:![0-9]+]], !llvm.loop [[LOOP3:![0-9]+]]
; VF16UF1: [[EXIT]]:
; VF16UF1-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll
index fdd5e0e..3d44317 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll
@@ -17,8 +17,7 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn
; VF8UF1: [[VECTOR_BODY]]:
; VF8UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF8UF1-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
-; VF8UF1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P_SRC]], i32 0
-; VF8UF1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[P_SRC]], align 1
; VF8UF1-NEXT: [[TMP3:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
; VF8UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; VF8UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]])
@@ -28,11 +27,11 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn
; VF8UF1: [[MIDDLE_SPLIT]]:
; VF8UF1-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
; VF8UF1: [[MIDDLE_BLOCK]]:
-; VF8UF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF1-NEXT: br label %[[EXIT:.*]]
; VF8UF1: [[VECTOR_EARLY_EXIT]]:
; VF8UF1-NEXT: br label %[[EXIT]]
; VF8UF1: [[SCALAR_PH]]:
-; VF8UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF8UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; VF8UF1-NEXT: br label %[[LOOP_HEADER:.*]]
; VF8UF1: [[LOOP_HEADER]]:
; VF8UF1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -55,35 +54,33 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn
; VF8UF2: [[VECTOR_PH]]:
; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF8UF2: [[VECTOR_BODY]]:
-; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF8UF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
-; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
-; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
-; VF8UF2-NEXT: [[TMP6:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD1]], zeroinitializer
-; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; VF8UF2-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP6]])
-; VF8UF2-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; VF8UF2-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]]
-; VF8UF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF8UF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 8
+; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[A]], align 1
+; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
+; VF8UF2-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
+; VF8UF2-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD1]], zeroinitializer
+; VF8UF2-NEXT: [[TMP3:%.*]] = or <8 x i1> [[TMP1]], [[TMP2]]
+; VF8UF2-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]])
+; VF8UF2-NEXT: br label %[[MIDDLE_SPLIT:.*]]
; VF8UF2: [[MIDDLE_SPLIT]]:
-; VF8UF2-NEXT: br i1 [[TMP3]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
+; VF8UF2-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
; VF8UF2: [[MIDDLE_BLOCK]]:
-; VF8UF2-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF2-NEXT: br label %[[EXIT:.*]]
; VF8UF2: [[VECTOR_EARLY_EXIT]]:
; VF8UF2-NEXT: br label %[[EXIT]]
; VF8UF2: [[SCALAR_PH]]:
-; VF8UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF8UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; VF8UF2-NEXT: br label %[[LOOP_HEADER:.*]]
; VF8UF2: [[LOOP_HEADER]]:
-; VF8UF2-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; VF8UF2-NEXT: [[P_SRC1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV1]]
-; VF8UF2-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC1]], align 1
+; VF8UF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; VF8UF2-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
+; VF8UF2-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
; VF8UF2-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0
; VF8UF2-NEXT: br i1 [[C]], label %[[EXIT]], label %[[LOOP_LATCH]]
; VF8UF2: [[LOOP_LATCH]]:
-; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1
+; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16
-; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
; VF8UF2: [[EXIT]]:
; VF8UF2-NEXT: [[RES:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ]
; VF8UF2-NEXT: ret i8 [[RES]]
@@ -95,19 +92,18 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn
; VF16UF1: [[VECTOR_PH]]:
; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]]
; VF16UF1: [[VECTOR_BODY]]:
-; VF16UF1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 0
-; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[A]], align 1
; VF16UF1-NEXT: [[TMP3:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer
; VF16UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
; VF16UF1-NEXT: br label %[[MIDDLE_SPLIT:.*]]
; VF16UF1: [[MIDDLE_SPLIT]]:
; VF16UF1-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
; VF16UF1: [[MIDDLE_BLOCK]]:
-; VF16UF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF16UF1-NEXT: br label %[[EXIT:.*]]
; VF16UF1: [[VECTOR_EARLY_EXIT]]:
; VF16UF1-NEXT: br label %[[EXIT]]
; VF16UF1: [[SCALAR_PH]]:
-; VF16UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF16UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; VF16UF1-NEXT: br label %[[LOOP_HEADER:.*]]
; VF16UF1: [[LOOP_HEADER]]:
; VF16UF1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -153,8 +149,7 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer
; VF8UF1: [[VECTOR_BODY]]:
; VF8UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF8UF1-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
-; VF8UF1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P_SRC]], i32 0
-; VF8UF1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[P_SRC]], align 1
; VF8UF1-NEXT: [[TMP3:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
; VF8UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; VF8UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]])
@@ -164,13 +159,13 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer
; VF8UF1: [[MIDDLE_SPLIT]]:
; VF8UF1-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
; VF8UF1: [[MIDDLE_BLOCK]]:
-; VF8UF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF1-NEXT: br label %[[EXIT:.*]]
; VF8UF1: [[VECTOR_EARLY_EXIT]]:
; VF8UF1-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP3]], i1 true)
; VF8UF1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], [[FIRST_ACTIVE_LANE]]
; VF8UF1-NEXT: br label %[[EXIT]]
; VF8UF1: [[SCALAR_PH]]:
-; VF8UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF8UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; VF8UF1-NEXT: br label %[[LOOP_HEADER:.*]]
; VF8UF1: [[LOOP_HEADER]]:
; VF8UF1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -193,26 +188,29 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer
; VF8UF2: [[VECTOR_PH]]:
; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF8UF2: [[VECTOR_BODY]]:
-; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF8UF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
-; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
-; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
-; VF8UF2-NEXT: [[TMP6:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD1]], zeroinitializer
-; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; VF8UF2-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP6]])
-; VF8UF2-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; VF8UF2-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]]
-; VF8UF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF8UF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 8
+; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[A]], align 1
+; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
+; VF8UF2-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
+; VF8UF2-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD1]], zeroinitializer
+; VF8UF2-NEXT: [[TMP3:%.*]] = or <8 x i1> [[TMP1]], [[TMP2]]
+; VF8UF2-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]])
+; VF8UF2-NEXT: br label %[[MIDDLE_SPLIT:.*]]
; VF8UF2: [[MIDDLE_SPLIT]]:
-; VF8UF2-NEXT: br i1 [[TMP3]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
+; VF8UF2-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
; VF8UF2: [[MIDDLE_BLOCK]]:
-; VF8UF2-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF2-NEXT: br label %[[EXIT:.*]]
; VF8UF2: [[VECTOR_EARLY_EXIT]]:
-; VF8UF2-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP6]], i1 true)
-; VF8UF2-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], [[TMP8]]
+; VF8UF2-NEXT: [[TMP5:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP2]], i1 true)
+; VF8UF2-NEXT: [[TMP7:%.*]] = add i64 8, [[TMP5]]
+; VF8UF2-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP1]], i1 true)
+; VF8UF2-NEXT: [[TMP9:%.*]] = add i64 0, [[TMP8]]
+; VF8UF2-NEXT: [[TMP10:%.*]] = icmp ne i64 [[TMP8]], 8
+; VF8UF2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 [[TMP7]]
+; VF8UF2-NEXT: [[TMP12:%.*]] = add i64 0, [[TMP11]]
; VF8UF2-NEXT: br label %[[EXIT]]
; VF8UF2: [[SCALAR_PH]]:
-; VF8UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF8UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; VF8UF2-NEXT: br label %[[LOOP_HEADER:.*]]
; VF8UF2: [[LOOP_HEADER]]:
; VF8UF2-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -223,9 +221,9 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer
; VF8UF2: [[LOOP_LATCH]]:
; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1
; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16
-; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
; VF8UF2: [[EXIT]]:
-; VF8UF2-NEXT: [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[TMP7]], %[[VECTOR_EARLY_EXIT]] ]
+; VF8UF2-NEXT: [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[TMP12]], %[[VECTOR_EARLY_EXIT]] ]
; VF8UF2-NEXT: ret i64 [[RES]]
;
; VF16UF1-LABEL: define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(
@@ -235,21 +233,20 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer
; VF16UF1: [[VECTOR_PH]]:
; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]]
; VF16UF1: [[VECTOR_BODY]]:
-; VF16UF1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 0
-; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[A]], align 1
; VF16UF1-NEXT: [[TMP3:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer
; VF16UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
; VF16UF1-NEXT: br label %[[MIDDLE_SPLIT:.*]]
; VF16UF1: [[MIDDLE_SPLIT]]:
; VF16UF1-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
; VF16UF1: [[MIDDLE_BLOCK]]:
-; VF16UF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF16UF1-NEXT: br label %[[EXIT:.*]]
; VF16UF1: [[VECTOR_EARLY_EXIT]]:
; VF16UF1-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> [[TMP3]], i1 true)
; VF16UF1-NEXT: [[TMP5:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE]]
; VF16UF1-NEXT: br label %[[EXIT]]
; VF16UF1: [[SCALAR_PH]]:
-; VF16UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF16UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; VF16UF1-NEXT: br label %[[LOOP_HEADER:.*]]
; VF16UF1: [[LOOP_HEADER]]:
; VF16UF1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
@@ -284,3 +281,147 @@ exit:
%res = phi i64 [ %iv, %loop.header ], [ 1, %loop.latch ]
ret i64 %res
}
+
+define i8 @test_early_exit_max_vector_tc_eq_16(ptr dereferenceable(17) %A) nosync nofree {
+; VF8UF1-LABEL: define i8 @test_early_exit_max_vector_tc_eq_16(
+; VF8UF1-SAME: ptr dereferenceable(17) [[A:%.*]]) #[[ATTR0]] {
+; VF8UF1-NEXT: [[ENTRY:.*]]:
+; VF8UF1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF1: [[VECTOR_PH]]:
+; VF8UF1-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF8UF1: [[VECTOR_BODY]]:
+; VF8UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF8UF1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; VF8UF1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
+; VF8UF1-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
+; VF8UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; VF8UF1-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP1]])
+; VF8UF1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; VF8UF1-NEXT: [[TMP4:%.*]] = or i1 [[TMP2]], [[TMP3]]
+; VF8UF1-NEXT: br i1 [[TMP4]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF8UF1: [[MIDDLE_SPLIT]]:
+; VF8UF1-NEXT: br i1 [[TMP2]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
+; VF8UF1: [[MIDDLE_BLOCK]]:
+; VF8UF1-NEXT: br label %[[SCALAR_PH]]
+; VF8UF1: [[VECTOR_EARLY_EXIT]]:
+; VF8UF1-NEXT: br label %[[EXIT:.*]]
+; VF8UF1: [[SCALAR_PH]]:
+; VF8UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF8UF1-NEXT: br label %[[LOOP_HEADER:.*]]
+; VF8UF1: [[LOOP_HEADER]]:
+; VF8UF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; VF8UF1-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
+; VF8UF1-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF8UF1-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0
+; VF8UF1-NEXT: br i1 [[C]], label %[[EXIT]], label %[[LOOP_LATCH]]
+; VF8UF1: [[LOOP_LATCH]]:
+; VF8UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; VF8UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 17
+; VF8UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF8UF1: [[EXIT]]:
+; VF8UF1-NEXT: [[RES:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ]
+; VF8UF1-NEXT: ret i8 [[RES]]
+;
+; VF8UF2-LABEL: define i8 @test_early_exit_max_vector_tc_eq_16(
+; VF8UF2-SAME: ptr dereferenceable(17) [[A:%.*]]) #[[ATTR0]] {
+; VF8UF2-NEXT: [[ENTRY:.*]]:
+; VF8UF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF2: [[VECTOR_PH]]:
+; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF8UF2: [[VECTOR_BODY]]:
+; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF8UF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8
+; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
+; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; VF8UF2-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
+; VF8UF2-NEXT: [[TMP3:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD1]], zeroinitializer
+; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF8UF2-NEXT: [[TMP4:%.*]] = or <8 x i1> [[TMP2]], [[TMP3]]
+; VF8UF2-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP4]])
+; VF8UF2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; VF8UF2-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; VF8UF2-NEXT: br i1 [[TMP7]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF8UF2: [[MIDDLE_SPLIT]]:
+; VF8UF2-NEXT: br i1 [[TMP5]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
+; VF8UF2: [[MIDDLE_BLOCK]]:
+; VF8UF2-NEXT: br label %[[SCALAR_PH]]
+; VF8UF2: [[VECTOR_EARLY_EXIT]]:
+; VF8UF2-NEXT: br label %[[EXIT:.*]]
+; VF8UF2: [[SCALAR_PH]]:
+; VF8UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF8UF2-NEXT: br label %[[LOOP_HEADER:.*]]
+; VF8UF2: [[LOOP_HEADER]]:
+; VF8UF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; VF8UF2-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
+; VF8UF2-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF8UF2-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0
+; VF8UF2-NEXT: br i1 [[C]], label %[[EXIT]], label %[[LOOP_LATCH]]
+; VF8UF2: [[LOOP_LATCH]]:
+; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 17
+; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF8UF2: [[EXIT]]:
+; VF8UF2-NEXT: [[RES:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ]
+; VF8UF2-NEXT: ret i8 [[RES]]
+;
+; VF16UF1-LABEL: define i8 @test_early_exit_max_vector_tc_eq_16(
+; VF16UF1-SAME: ptr dereferenceable(17) [[A:%.*]]) #[[ATTR0]] {
+; VF16UF1-NEXT: [[ENTRY:.*]]:
+; VF16UF1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF16UF1: [[VECTOR_PH]]:
+; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF16UF1: [[VECTOR_BODY]]:
+; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF16UF1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
+; VF16UF1-NEXT: [[TMP1:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer
+; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF16UF1-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP1]])
+; VF16UF1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; VF16UF1-NEXT: [[TMP4:%.*]] = or i1 [[TMP2]], [[TMP3]]
+; VF16UF1-NEXT: br i1 [[TMP4]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF16UF1: [[MIDDLE_SPLIT]]:
+; VF16UF1-NEXT: br i1 [[TMP2]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
+; VF16UF1: [[MIDDLE_BLOCK]]:
+; VF16UF1-NEXT: br label %[[SCALAR_PH]]
+; VF16UF1: [[VECTOR_EARLY_EXIT]]:
+; VF16UF1-NEXT: br label %[[EXIT:.*]]
+; VF16UF1: [[SCALAR_PH]]:
+; VF16UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF16UF1-NEXT: br label %[[LOOP_HEADER:.*]]
+; VF16UF1: [[LOOP_HEADER]]:
+; VF16UF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; VF16UF1-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
+; VF16UF1-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF16UF1-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0
+; VF16UF1-NEXT: br i1 [[C]], label %[[EXIT]], label %[[LOOP_LATCH]]
+; VF16UF1: [[LOOP_LATCH]]:
+; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 17
+; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF16UF1: [[EXIT]]:
+; VF16UF1-NEXT: [[RES:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ]
+; VF16UF1-NEXT: ret i8 [[RES]]
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %p.src = getelementptr inbounds i8, ptr %A, i64 %iv
+ %l = load i8, ptr %p.src, align 1
+ %c = icmp eq i8 %l, 0
+ br i1 %c, label %exit, label %loop.latch
+
+loop.latch:
+ %iv.next = add nsw i64 %iv, 1
+ %cmp = icmp eq i64 %iv.next, 17
+ br i1 %cmp, label %exit, label %loop.header
+
+exit:
+ %res = phi i8 [ 0, %loop.header ], [ 1, %loop.latch ]
+ ret i8 %res
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-outside-iv-users.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-outside-iv-users.ll
index 78f5cc7..5f1cee8 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-outside-iv-users.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-outside-iv-users.ll
@@ -11,15 +11,14 @@ define i64 @remove_loop_region_int_iv_used_outside(ptr %dst) {
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr ptr, ptr [[DST]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr ptr, ptr [[DST]], i32 8
-; CHECK-NEXT: store <8 x ptr> zeroinitializer, ptr [[TMP1]], align 8
+; CHECK-NEXT: store <8 x ptr> zeroinitializer, ptr [[DST]], align 8
; CHECK-NEXT: store <8 x ptr> zeroinitializer, ptr [[TMP2]], align 8
; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -56,15 +55,14 @@ define i64 @remove_loop_region_int_iv_inc_used_outside(ptr %dst) {
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr ptr, ptr [[DST]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr ptr, ptr [[DST]], i32 8
-; CHECK-NEXT: store <8 x ptr> zeroinitializer, ptr [[TMP1]], align 8
+; CHECK-NEXT: store <8 x ptr> zeroinitializer, ptr [[DST]], align 8
; CHECK-NEXT: store <8 x ptr> zeroinitializer, ptr [[TMP2]], align 8
; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -102,17 +100,16 @@ define ptr @remove_loop_region_ptr_iv_used_outside(ptr %dst) {
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[DST]], i64 128
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr ptr, ptr [[DST]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr ptr, ptr [[DST]], i32 8
-; CHECK-NEXT: store <8 x ptr> zeroinitializer, ptr [[TMP1]], align 8
+; CHECK-NEXT: store <8 x ptr> zeroinitializer, ptr [[DST]], align 8
; CHECK-NEXT: store <8 x ptr> zeroinitializer, ptr [[TMP2]], align 8
; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[TMP0]], i64 -8
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[DST]], %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[DST]], %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
@@ -152,16 +149,15 @@ define ptr @remove_loop_region_ptr_iv_inc_used_outside(ptr %dst) {
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[DST]], i64 128
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr ptr, ptr [[DST]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr ptr, ptr [[DST]], i32 8
-; CHECK-NEXT: store <8 x ptr> zeroinitializer, ptr [[TMP1]], align 8
+; CHECK-NEXT: store <8 x ptr> zeroinitializer, ptr [[DST]], align 8
; CHECK-NEXT: store <8 x ptr> zeroinitializer, ptr [[TMP2]], align 8
; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[DST]], %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[DST]], %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
index 001bf0c..59c76ae 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
@@ -23,10 +23,9 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
; VF8UF1: [[VECTOR_BODY]]:
; VF8UF1-NEXT: [[TMP2:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF8UF1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
-; VF8UF1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; VF8UF1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1
+; VF8UF1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[NEXT_GEP]], align 1
; VF8UF1-NEXT: [[TMP4:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
-; VF8UF1-NEXT: store <8 x i8> [[TMP4]], ptr [[TMP3]], align 1
+; VF8UF1-NEXT: store <8 x i8> [[TMP4]], ptr [[NEXT_GEP]], align 1
; VF8UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP2]], 8
; VF8UF1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; VF8UF1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -62,15 +61,13 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF8UF2: [[VECTOR_BODY]]:
-; VF8UF2-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[A]], i32 0
; VF8UF2-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i32 8
-; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[A]], align 1
; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1
; VF8UF2-NEXT: [[TMP4:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
; VF8UF2-NEXT: [[TMP5:%.*]] = add nsw <8 x i8> [[WIDE_LOAD1]], splat (i8 10)
-; VF8UF2-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i32 0
; VF8UF2-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i32 8
-; VF8UF2-NEXT: store <8 x i8> [[TMP4]], ptr [[TMP6]], align 1
+; VF8UF2-NEXT: store <8 x i8> [[TMP4]], ptr [[A]], align 1
; VF8UF2-NEXT: store <8 x i8> [[TMP5]], ptr [[TMP7]], align 1
; VF8UF2-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; VF8UF2: [[MIDDLE_BLOCK]]:
@@ -105,11 +102,9 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
; VF16UF1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]]
; VF16UF1: [[VECTOR_BODY]]:
-; VF16UF1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[A]], i32 0
-; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[A]], align 1
; VF16UF1-NEXT: [[TMP3:%.*]] = add nsw <16 x i8> [[WIDE_LOAD]], splat (i8 10)
-; VF16UF1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i32 0
-; VF16UF1-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP4]], align 1
+; VF16UF1-NEXT: store <16 x i8> [[TMP3]], ptr [[A]], align 1
; VF16UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; VF16UF1: [[MIDDLE_BLOCK]]:
; VF16UF1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]]
@@ -557,11 +552,9 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias
; VF8UF1: [[VECTOR_BODY]]:
; VF8UF1-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF8UF1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[TMP0]]
-; VF8UF1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; VF8UF1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
; VF8UF1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
-; VF8UF1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
-; VF8UF1-NEXT: store <8 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 1
+; VF8UF1-NEXT: store <8 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 1
; VF8UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 8
; VF8UF1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; VF8UF1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -600,13 +593,11 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias
; VF8UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF8UF2: [[VECTOR_BODY]]:
-; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
; VF8UF2-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
-; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1
; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
-; VF8UF2-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i32 0
; VF8UF2-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i32 8
-; VF8UF2-NEXT: store <8 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 1
+; VF8UF2-NEXT: store <8 x i8> [[WIDE_LOAD]], ptr [[DST]], align 1
; VF8UF2-NEXT: store <8 x i8> [[WIDE_LOAD1]], ptr [[TMP5]], align 1
; VF8UF2-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; VF8UF2: [[MIDDLE_BLOCK]]:
@@ -636,7 +627,7 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias
; VF16UF1-NEXT: [[ENTRY:.*]]:
; VF16UF1-NEXT: br label %[[OUTER_HEADER:.*]]
; VF16UF1: [[OUTER_HEADER]]:
-; VF16UF1-NEXT: [[TMP0:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], %[[OUTER_LATCH:.*]] ]
+; VF16UF1-NEXT: [[TMP1:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], %[[OUTER_LATCH:.*]] ]
; VF16UF1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
; VF16UF1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; VF16UF1: [[VECTOR_PH]]:
@@ -644,10 +635,8 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias
; VF16UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]]
; VF16UF1: [[VECTOR_BODY]]:
-; VF16UF1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
-; VF16UF1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i32 0
-; VF16UF1-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 1
+; VF16UF1-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[DST]], align 1
; VF16UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; VF16UF1: [[MIDDLE_BLOCK]]:
; VF16UF1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -657,7 +646,7 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias
; VF16UF1-NEXT: br label %[[INNER:.*]]
; VF16UF1: [[INNER]]:
; VF16UF1-NEXT: [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[INNER]] ]
-; VF16UF1-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[TMP0]], i64 [[INNER_IV]]
+; VF16UF1-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[TMP1]], i64 [[INNER_IV]]
; VF16UF1-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
; VF16UF1-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INNER_IV]]
; VF16UF1-NEXT: store i8 [[L]], ptr [[GEP_DST]], align 1
@@ -665,7 +654,7 @@ define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias
; VF16UF1-NEXT: [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; VF16UF1-NEXT: br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP4:![0-9]+]]
; VF16UF1: [[OUTER_LATCH]]:
-; VF16UF1-NEXT: [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[TMP0]], i64 1
+; VF16UF1-NEXT: [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[TMP1]], i64 1
; VF16UF1-NEXT: [[C_2:%.*]] = call i1 @cond()
; VF16UF1-NEXT: br i1 [[C_2]], label %[[OUTER_HEADER]], label %[[EXIT:.*]]
; VF16UF1: [[EXIT]]:
@@ -1229,6 +1218,133 @@ exit:
ret void
}
+define void @test_vector_tc_eq_16(ptr %A) {
+; VF8UF1-LABEL: define void @test_vector_tc_eq_16(
+; VF8UF1-SAME: ptr [[A:%.*]]) {
+; VF8UF1-NEXT: [[ENTRY:.*]]:
+; VF8UF1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF1: [[VECTOR_PH]]:
+; VF8UF1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 16
+; VF8UF1-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF8UF1: [[VECTOR_BODY]]:
+; VF8UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF8UF1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; VF8UF1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[NEXT_GEP]], align 1
+; VF8UF1-NEXT: [[TMP1:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF8UF1-NEXT: store <8 x i8> [[TMP1]], ptr [[NEXT_GEP]], align 1
+; VF8UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; VF8UF1-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; VF8UF1-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; VF8UF1: [[MIDDLE_BLOCK]]:
+; VF8UF1-NEXT: br label %[[SCALAR_PH]]
+; VF8UF1: [[SCALAR_PH]]:
+; VF8UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF8UF1-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF8UF1-NEXT: br label %[[LOOP:.*]]
+; VF8UF1: [[LOOP]]:
+; VF8UF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT: [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT: [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF8UF1-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF8UF1-NEXT: [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF8UF1-NEXT: store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF8UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; VF8UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 17
+; VF8UF1-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; VF8UF1: [[EXIT]]:
+; VF8UF1-NEXT: ret void
+;
+; VF8UF2-LABEL: define void @test_vector_tc_eq_16(
+; VF8UF2-SAME: ptr [[A:%.*]]) {
+; VF8UF2-NEXT: [[ENTRY:.*]]:
+; VF8UF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF2: [[VECTOR_PH]]:
+; VF8UF2-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 16
+; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF8UF2: [[VECTOR_BODY]]:
+; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF8UF2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8
+; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[NEXT_GEP]], align 1
+; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; VF8UF2-NEXT: [[TMP2:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF8UF2-NEXT: [[TMP3:%.*]] = add nsw <8 x i8> [[WIDE_LOAD1]], splat (i8 10)
+; VF8UF2-NEXT: store <8 x i8> [[TMP2]], ptr [[NEXT_GEP]], align 1
+; VF8UF2-NEXT: store <8 x i8> [[TMP3]], ptr [[TMP1]], align 1
+; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF8UF2-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; VF8UF2-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF8UF2: [[MIDDLE_BLOCK]]:
+; VF8UF2-NEXT: br label %[[SCALAR_PH]]
+; VF8UF2: [[SCALAR_PH]]:
+; VF8UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF8UF2-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF8UF2-NEXT: br label %[[LOOP:.*]]
+; VF8UF2: [[LOOP]]:
+; VF8UF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT: [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT: [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF8UF2-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF8UF2-NEXT: [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF8UF2-NEXT: store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 17
+; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF8UF2: [[EXIT]]:
+; VF8UF2-NEXT: ret void
+;
+; VF16UF1-LABEL: define void @test_vector_tc_eq_16(
+; VF16UF1-SAME: ptr [[A:%.*]]) {
+; VF16UF1-NEXT: [[ENTRY:.*]]:
+; VF16UF1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF16UF1: [[VECTOR_PH]]:
+; VF16UF1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 16
+; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF16UF1: [[VECTOR_BODY]]:
+; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF16UF1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; VF16UF1-NEXT: [[TMP1:%.*]] = add nsw <16 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF16UF1-NEXT: store <16 x i8> [[TMP1]], ptr [[NEXT_GEP]], align 1
+; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF16UF1-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; VF16UF1-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF16UF1: [[MIDDLE_BLOCK]]:
+; VF16UF1-NEXT: br label %[[SCALAR_PH]]
+; VF16UF1: [[SCALAR_PH]]:
+; VF16UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF16UF1-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF16UF1-NEXT: br label %[[LOOP:.*]]
+; VF16UF1: [[LOOP]]:
+; VF16UF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT: [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT: [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF16UF1-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF16UF1-NEXT: [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF16UF1-NEXT: store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 17
+; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF16UF1: [[EXIT]]:
+; VF16UF1-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %p.src = phi ptr [ %A, %entry ], [ %p.src.next, %loop ]
+ %p.src.next = getelementptr inbounds i8, ptr %p.src, i64 1
+ %l = load i8, ptr %p.src, align 1
+ %add = add nsw i8 %l, 10
+ store i8 %add, ptr %p.src
+ %iv.next = add nsw i64 %iv, 1
+ %cmp = icmp eq i64 %iv.next, 17
+ br i1 %cmp, label %exit, label %loop
+
+exit:
+ ret void
+}
;.
; VF8UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; VF8UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -1238,6 +1354,8 @@ exit:
; VF8UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
; VF8UF1: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]}
; VF8UF1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; VF8UF1: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; VF8UF1: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
;.
; VF8UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; VF8UF2: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
@@ -1245,6 +1363,8 @@ exit:
; VF8UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
; VF8UF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
; VF8UF2: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
+; VF8UF2: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]}
+; VF8UF2: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]}
;.
; VF16UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; VF16UF1: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
@@ -1252,4 +1372,6 @@ exit:
; VF16UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
; VF16UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
; VF16UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
+; VF16UF1: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]}
+; VF16UF1: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll
index 17979e5f..d3f7794 100644
--- a/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll
+++ b/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll
@@ -51,15 +51,12 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; NO-VP: vector.body:
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP4]], align 4
; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; NO-VP-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
; NO-VP-NEXT: [[TMP16:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP16]], ptr [[TMP10]], align 4
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP16]], ptr [[TMP9]], align 4
; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -98,15 +95,12 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; NO-VP-DEF: vector.body:
; NO-VP-DEF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-VP-DEF-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
-; NO-VP-DEF-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; NO-VP-DEF-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP5]], align 4
+; NO-VP-DEF-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP4]], align 4
; NO-VP-DEF-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[INDEX]]
-; NO-VP-DEF-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; NO-VP-DEF-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 1 x i32>, ptr [[TMP7]], align 4
+; NO-VP-DEF-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 1 x i32>, ptr [[TMP6]], align 4
; NO-VP-DEF-NEXT: [[TMP8:%.*]] = add nsw <vscale x 1 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; NO-VP-DEF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; NO-VP-DEF-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; NO-VP-DEF-NEXT: store <vscale x 1 x i32> [[TMP8]], ptr [[TMP10]], align 4
+; NO-VP-DEF-NEXT: store <vscale x 1 x i32> [[TMP8]], ptr [[TMP9]], align 4
; NO-VP-DEF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
; NO-VP-DEF-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NO-VP-DEF-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
index 8d854dd..2873947 100644
--- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
+++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
@@ -10,7 +10,7 @@ define void @test_versioned_with_sext_use(i32 %offset, ptr %dst) {
; CHECK-NEXT: [[OFFSET_EXT:%.*]] = sext i32 [[OFFSET]] to i64
; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
; CHECK: outer.header.loopexit:
-; CHECK-NEXT: [[IV_2_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_2_NEXT:%.*]], [[INNER_LOOP:%.*]] ], [ [[IND_END:%.*]], [[MIDDLE_BLOCK:%.*]] ]
+; CHECK-NEXT: [[IV_2_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_2_NEXT:%.*]], [[INNER_LOOP:%.*]] ]
; CHECK-NEXT: br label [[OUTER_HEADER]]
; CHECK: outer.header:
; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_2_NEXT_LCSSA]], [[OUTER_HEADER_LOOPEXIT:%.*]] ]
@@ -23,20 +23,19 @@ define void @test_versioned_with_sext_use(i32 %offset, ptr %dst) {
; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = mul i64 200, [[OFFSET_EXT]]
-; CHECK-NEXT: [[IND_END]] = add i64 [[IV_1]], [[TMP0]]
+; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IV_1]], [[TMP0]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[INDEX]], [[OFFSET_EXT]]
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IV_1]], [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP5]], align 8
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[OUTER_HEADER_LOOPEXIT]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[IV_1]], [[INNER_LOOP_PREHEADER]] ], [ [[IV_1]], [[VECTOR_SCEVCHECK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[INNER_LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
@@ -83,7 +82,7 @@ define void @test_versioned_with_zext_use(i32 %offset, ptr %dst) {
; CHECK-NEXT: [[OFFSET_EXT:%.*]] = zext i32 [[OFFSET]] to i64
; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
; CHECK: outer.header.loopexit:
-; CHECK-NEXT: [[IV_2_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_2_NEXT:%.*]], [[INNER_LOOP:%.*]] ], [ [[IND_END:%.*]], [[MIDDLE_BLOCK:%.*]] ]
+; CHECK-NEXT: [[IV_2_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_2_NEXT:%.*]], [[INNER_LOOP:%.*]] ]
; CHECK-NEXT: br label [[OUTER_HEADER]]
; CHECK: outer.header:
; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_2_NEXT_LCSSA]], [[OUTER_HEADER_LOOPEXIT:%.*]] ]
@@ -96,20 +95,19 @@ define void @test_versioned_with_zext_use(i32 %offset, ptr %dst) {
; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = mul i64 200, [[OFFSET_EXT]]
-; CHECK-NEXT: [[IND_END]] = add i64 [[IV_1]], [[TMP0]]
+; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IV_1]], [[TMP0]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[INDEX]], [[OFFSET_EXT]]
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IV_1]], [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP5]], align 8
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[OUTER_HEADER_LOOPEXIT]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[IV_1]], [[INNER_LOOP_PREHEADER]] ], [ [[IV_1]], [[VECTOR_SCEVCHECK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[INNER_LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
@@ -179,9 +177,9 @@ define void @versioned_sext_use_in_gep(i32 %scale, ptr %dst, i64 %scale.2) {
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -224,7 +222,7 @@ define void @test_versioned_with_different_uses(i32 %offset, ptr noalias %dst.1,
; CHECK-NEXT: [[OFFSET_EXT:%.*]] = zext i32 [[OFFSET]] to i64
; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
; CHECK: outer.header.loopexit:
-; CHECK-NEXT: [[IV_2_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_2_NEXT:%.*]], [[INNER_LOOP:%.*]] ], [ [[IND_END:%.*]], [[MIDDLE_BLOCK:%.*]] ]
+; CHECK-NEXT: [[IV_2_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_2_NEXT:%.*]], [[INNER_LOOP:%.*]] ]
; CHECK-NEXT: br label [[OUTER_HEADER]]
; CHECK: outer.header:
; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_2_NEXT_LCSSA]], [[OUTER_HEADER_LOOPEXIT:%.*]] ]
@@ -237,7 +235,7 @@ define void @test_versioned_with_different_uses(i32 %offset, ptr noalias %dst.1,
; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = mul i64 200, [[OFFSET_EXT]]
-; CHECK-NEXT: [[IND_END]] = add i64 [[IV_1]], [[TMP0]]
+; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IV_1]], [[TMP0]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -257,13 +255,12 @@ define void @test_versioned_with_different_uses(i32 %offset, ptr noalias %dst.1,
; CHECK-NEXT: store i32 0, ptr [[TMP10]], align 8
; CHECK-NEXT: store i32 0, ptr [[TMP11]], align 8
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[DST_2]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP13]], align 8
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP12]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[OUTER_HEADER_LOOPEXIT]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[IV_1]], [[INNER_LOOP_PREHEADER]] ], [ [[IV_1]], [[VECTOR_SCEVCHECK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[INNER_LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
@@ -352,14 +349,13 @@ define void @test_versioned_with_non_ex_use(i32 %offset, ptr noalias %dst.1, ptr
; CHECK-NEXT: store i32 0, ptr [[TMP16]], align 8
; CHECK-NEXT: store i32 0, ptr [[TMP18]], align 8
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST_2]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[TMP20]], i32 0
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP21]], align 8
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP20]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
@@ -376,7 +372,7 @@ define void @test_versioned_with_non_ex_use(i32 %offset, ptr noalias %dst.1, ptr
; CHECK-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], 1
; CHECK-NEXT: [[IV_3_NEXT]] = add i32 [[IV_3]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_3]], 200
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -427,8 +423,7 @@ define void @zext_of_i1_stride(i1 %g, ptr %dst) mustprogress {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], [[G_64]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0
-; CHECK-NEXT: store <4 x i16> splat (i16 1), ptr [[TMP5]], align 2
+; CHECK-NEXT: store <4 x i16> splat (i16 1), ptr [[TMP4]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
index 49a7fb7..128594c 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
@@ -104,11 +104,9 @@ define void @iv_expand(ptr %p, i64 %n) {
; CHECK-NEXT: EMIT-SCALAR vp<[[SCALAR_PHI:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
; CHECK-NEXT: WIDEN-PHI ir<%iv> = phi [ vp<[[INDUCTION]]>, ir-bb<vector.ph> ], [ vp<%vec.ind.next>, vector.body ]
; CHECK-NEXT: CLONE ir<%q> = getelementptr ir<%p>, vp<[[SCALAR_PHI]]>
-; CHECK-NEXT: vp<[[VEC_PTR_1:%.+]]> = vector-pointer ir<%q>
-; CHECK-NEXT: WIDEN ir<%x> = load vp<[[VEC_PTR_1]]>
+; CHECK-NEXT: WIDEN ir<%x> = load ir<%q>
; CHECK-NEXT: WIDEN ir<%y> = add ir<%x>, ir<%iv>
-; CHECK-NEXT: vp<[[VEC_PTR_2:%.+]]> = vector-pointer ir<%q>
-; CHECK-NEXT: WIDEN store vp<[[VEC_PTR_2]]>, ir<%y>
+; CHECK-NEXT: WIDEN store ir<%q>, ir<%y>
; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[SCALAR_PHI]]>, ir<8>
; CHECK-NEXT: EMIT vp<%vec.ind.next> = add ir<%iv>, vp<[[BROADCAST_INC]]>
; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<%n.vec>
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
index d441e41..46f9125 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
@@ -21,8 +21,7 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
; CHECK-NEXT: EMIT-SCALAR vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<[[CAN_IV_NEXT:%.+]]>, default.2 ]
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, ir<2>
; CHECK-NEXT: EMIT vp<[[PTR:%.+]]> = ptradd ir<%start>, vp<[[STEPS]]>
-; CHECK-NEXT: vp<[[WIDE_PTR:%.+]]> = vector-pointer vp<[[PTR]]>
-; CHECK-NEXT: WIDEN ir<%l> = load vp<[[WIDE_PTR]]>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<[[PTR]]>
; CHECK-NEXT: EMIT vp<[[C1:%.+]]> = icmp eq ir<%l>, ir<-12>
; CHECK-NEXT: EMIT vp<[[C2:%.+]]> = icmp eq ir<%l>, ir<13>
; CHECK-NEXT: EMIT vp<[[OR_CASES:%.+]]> = or vp<[[C1]]>, vp<[[C2]]>
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
index 91e70d2..d856387 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
@@ -77,15 +77,13 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) {
; CHECK-NEXT: Successor(s): vector.body
; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: vp<[[VPTR1:%.]]> = vector-pointer ir<%A>
; CHECK-NEXT: vp<[[VPTR2:%.]]> = vector-pointer ir<%A>, ir<1>
-; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VPTR1]]>
+; CHECK-NEXT: WIDEN ir<%l> = load ir<%A>
; CHECK-NEXT: WIDEN ir<%l>.1 = load vp<[[VPTR2]]>
; CHECK-NEXT: WIDEN ir<%add> = add nsw ir<%l>, ir<10>
; CHECK-NEXT: WIDEN ir<%add>.1 = add nsw ir<%l>.1, ir<10>
-; CHECK-NEXT: vp<[[VPTR3:%.+]]> = vector-pointer ir<%A>
; CHECK-NEXT: vp<[[VPTR4:%.+]]> = vector-pointer ir<%A>, ir<1>
-; CHECK-NEXT: WIDEN store vp<[[VPTR3]]>, ir<%add>
+; CHECK-NEXT: WIDEN store ir<%A>, ir<%add>
; CHECK-NEXT: WIDEN store vp<[[VPTR4]]>, ir<%add>.1
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll b/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll
index 7572a92..5d0d391 100644
--- a/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll
+++ b/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll
@@ -15,15 +15,14 @@ define void @pr63340(ptr %A, ptr %B) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i8
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds ptr, ptr [[B]], i8 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i32 0
-; CHECK-NEXT: store <4 x ptr> [[DOTSPLAT]], ptr [[TMP2]], align 8
+; CHECK-NEXT: store <4 x ptr> [[DOTSPLAT]], ptr [[TMP1]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
+; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ -128, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
; CHECK: loop.header:
; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
@@ -74,15 +73,14 @@ define void @wide_gep_index_invariant(ptr noalias %dst, ptr noalias %src, i64 %n
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, <4 x ptr> [[BROADCAST_SPLAT]], i64 [[N]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr ptr, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr ptr, ptr [[TMP2]], i32 0
-; CHECK-NEXT: store <4 x ptr> [[TMP1]], ptr [[TMP3]], align 8
+; CHECK-NEXT: store <4 x ptr> [[TMP1]], ptr [[TMP2]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -128,16 +126,15 @@ define void @wide_gep_multiple_indices_some_invariant(ptr noalias %dst, ptr noal
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [10 x float], <4 x ptr> [[BROADCAST_SPLAT]], i32 [[X]], <4 x i64> [[VEC_IND]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr ptr, ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr ptr, ptr [[TMP2]], i32 0
-; CHECK-NEXT: store <4 x ptr> [[TMP1]], ptr [[TMP3]], align 8
+; CHECK-NEXT: store <4 x ptr> [[TMP1]], ptr [[TMP2]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/widen-intrinsic.ll b/llvm/test/Transforms/LoopVectorize/widen-intrinsic.ll
index 072ff23..c23d2b3 100644
--- a/llvm/test/Transforms/LoopVectorize/widen-intrinsic.ll
+++ b/llvm/test/Transforms/LoopVectorize/widen-intrinsic.ll
@@ -12,17 +12,16 @@ define void @powi_only_first_lane_used_of_second_arg(ptr %p, i32 %pow) {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr float, ptr [[P]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, ptr [[TMP0]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[WIDE_LOAD]], i32 [[POW]])
-; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
+; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[TMP0]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVersioning/invalidate-laa-after-versioning.ll b/llvm/test/Transforms/LoopVersioning/invalidate-laa-after-versioning.ll
index 8075314..4148c35 100644
--- a/llvm/test/Transforms/LoopVersioning/invalidate-laa-after-versioning.ll
+++ b/llvm/test/Transforms/LoopVersioning/invalidate-laa-after-versioning.ll
@@ -56,19 +56,14 @@ define void @test(ptr %arg, i64 %arg1) {
; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds double, ptr [[LCSSA_PTR_IV_1]], i64 1
; CHECK-NEXT: br label [[INNER_2:%.*]]
; CHECK: inner.2:
-; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[INNER_2]] ], [ 0, [[INNER_1_EXIT]] ]
; CHECK-NEXT: [[PTR_IV_2:%.*]] = phi ptr [ [[GEP_5]], [[INNER_1_EXIT]] ], [ [[PTR_IV_2_NEXT:%.*]], [[INNER_2]] ]
; CHECK-NEXT: [[PTR_IV_2_NEXT]] = getelementptr inbounds double, ptr [[PTR_IV_2]], i64 1
-; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
; CHECK-NEXT: br i1 false, label [[INNER_3_LVER_CHECK:%.*]], label [[INNER_2]]
; CHECK: inner.3.lver.check:
-; CHECK-NEXT: [[INDVAR_LCSSA:%.*]] = phi i64 [ [[INDVAR]], [[INNER_2]] ]
; CHECK-NEXT: [[LCSSA_PTR_IV_2:%.*]] = phi ptr [ [[PTR_IV_2]], [[INNER_2]] ]
; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds double, ptr [[PTR_PHI]], i64 1
; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds double, ptr [[LCSSA_PTR_IV_2]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[INDVAR_LCSSA]], 3
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 24
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[LCSSA_PTR_IV_1]], i64 [[TMP1]]
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[LCSSA_PTR_IV_2]], i64 16
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[GEP_7]], [[GEP_1]]
; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[PTR_PHI]], [[SCEVGEP]]
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
@@ -90,10 +85,10 @@ define void @test(ptr %arg, i64 %arg1) {
; CHECK: inner.3:
; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ 0, [[INNER_3_PH]] ], [ [[IV_2_NEXT:%.*]], [[INNER_3]] ]
; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds double, ptr [[GEP_6]], i64 [[IV_2]]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP_7]], align 8, !alias.scope !0, !noalias !3
-; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP_8]], align 8, !alias.scope !3
+; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP_7]], align 8, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
+; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP_8]], align 8, !alias.scope [[META3]]
; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr double, ptr [[PTR_PHI]], i64 [[IV_2]]
-; CHECK-NEXT: [[TMP18:%.*]] = load double, ptr [[GEP_9]], align 8, !alias.scope !3
+; CHECK-NEXT: [[TMP18:%.*]] = load double, ptr [[GEP_9]], align 8, !alias.scope [[META3]]
; CHECK-NEXT: [[IV_2_NEXT]] = add nuw nsw i64 [[IV_2]], 1
; CHECK-NEXT: [[C_2:%.*]] = icmp eq i64 [[IV_2]], 1
; CHECK-NEXT: br i1 [[C_2]], label [[OUTER_LATCH_LOOPEXIT3:%.*]], label [[INNER_3]]
diff --git a/llvm/test/Transforms/MemCpyOpt/stack-move.ll b/llvm/test/Transforms/MemCpyOpt/stack-move.ll
index 4a75c5e..31e255b 100644
--- a/llvm/test/Transforms/MemCpyOpt/stack-move.ll
+++ b/llvm/test/Transforms/MemCpyOpt/stack-move.ll
@@ -1649,20 +1649,13 @@ loop_exit:
ret void
}
-; Tests that failure because partial-sized lifetimes are counted as mod.
+; Tests that partial-sized lifetimes do not inhibit the optimization.
define void @partial_lifetime() {
; CHECK-LABEL: define void @partial_lifetime() {
-; CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
-; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO]], align 4
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr captures(none) [[SRC]])
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 3, ptr captures(none) [[DEST]])
-; CHECK-NEXT: store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[SRC]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr captures(none) [[SRC]])
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC]], i64 12, i1 false)
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 3, ptr captures(none) [[SRC]])
+; CHECK-NEXT: [[DEST:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4
+; CHECK-NEXT: store [[STRUCT_FOO]] { i32 10, i32 20, i32 30 }, ptr [[DEST]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr captures(none) [[DEST]])
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr captures(none) [[DEST]])
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr captures(none) [[SRC]])
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr captures(none) [[DEST]])
; CHECK-NEXT: ret void
;
%src = alloca %struct.Foo, align 4
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/func_assign_fix.ll b/llvm/test/Transforms/MemProfContextDisambiguation/func_assign_fix.ll
new file mode 100644
index 0000000..d0450e0
--- /dev/null
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/func_assign_fix.ll
@@ -0,0 +1,130 @@
+;; Make sure we assign the original callsite to a function clone (which will be
+;; the original function clone), even when we cannot update its caller (due to
+;; missing metadata, e.g. from mismatched profiles). Otherwise we will try to use
+;; the original function for a different clone, leading to confusion later when
+;; rewriting the calls.
+
+;; -stats requires asserts
+; REQUIRES: asserts
+
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats -debug \
+; RUN: -pass-remarks=memprof-context-disambiguation %s -S 2>&1 | \
+; RUN: FileCheck %s --implicit-check-not="Mismatch in call clone assignment" \
+; RUN: --implicit-check-not="Number of callsites assigned to call multiple non-matching clones"
+
+
+; ModuleID = '<stdin>'
+source_filename = "reduced.ll"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: define void @A()
+define void @A() {
+ ; CHECK: call void @C()
+ call void @C()
+ ret void
+}
+
+; CHECK-LABEL: define void @B()
+define void @B() {
+ ; CHECK: call void @C.memprof.1()
+ call void @C(), !callsite !1
+ ret void
+}
+
+; CHECK-LABEL: define void @C()
+define void @C() {
+ ; CHECK: call void @F()
+ call void @F(), !callsite !16
+ ; CHECK: call void @D()
+ call void @D(), !callsite !2
+ ret void
+}
+
+; CHECK-LABEL: define void @D()
+define void @D() {
+ ; CHECK: call void @E()
+ call void @E(), !callsite !3
+ ; CHECK: call void @G()
+ call void @G(), !callsite !17
+ ret void
+}
+
+; CHECK-LABEL: define void @E()
+define void @E() {
+ ; CHECK: call ptr @_Znwm(i64 0) #[[NOTCOLD:[0-9]+]]
+ %1 = call ptr @_Znwm(i64 0), !memprof !4, !callsite !9
+ ret void
+}
+
+; CHECK-LABEL: define void @F()
+define void @F() {
+ ; CHECK: call void @G()
+ call void @G(), !callsite !17
+ ret void
+}
+
+; CHECK-LABEL: define void @G()
+define void @G() {
+ ; CHECK: call ptr @_Znwm(i64 0) #[[NOTCOLD]]
+ %2 = call ptr @_Znwm(i64 0), !memprof !10, !callsite !15
+ ret void
+}
+
+; CHECK-LABEL: define void @A1()
+define void @A1() {
+ ; CHECK: call void @C()
+ call void @C(), !callsite !18
+ ret void
+}
+
+; CHECK-LABEL: define void @B1()
+define void @B1() {
+ ; CHECK: call void @C.memprof.1()
+ call void @C(), !callsite !19
+ ret void
+}
+
+; CHECK-LABEL: define void @C.memprof.1()
+ ; CHECK: call void @F.memprof.1()
+ ; CHECK: call void @D.memprof.1()
+
+; CHECK-LABEL: define void @D.memprof.1()
+ ; CHECK: call void @E.memprof.1()
+ ; CHECK: call void @G()
+
+; CHECK-LABEL: define void @E.memprof.1()
+ ; CHECK: call ptr @_Znwm(i64 0) #[[COLD:[0-9]+]]
+
+; CHECK-LABEL: define void @F.memprof.1()
+ ; CHECK: call void @G.memprof.1()
+
+; CHECK-LABEL: define void @G.memprof.1()
+ ; CHECK: call ptr @_Znwm(i64 0) #[[COLD]]
+
+declare ptr @_Znwm(i64)
+
+; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { "memprof"="cold" }
+
+!0 = !{i64 123}
+!1 = !{i64 234}
+!2 = !{i64 345}
+!3 = !{i64 456}
+!4 = !{!5, !7}
+!5 = !{!6, !"notcold"}
+!6 = !{i64 567, i64 456, i64 345, i64 123}
+!7 = !{!8, !"cold"}
+!8 = !{i64 567, i64 456, i64 345, i64 234}
+!9 = !{i64 567}
+!10 = !{!11, !13}
+!11 = !{!12, !"notcold"}
+!12 = !{i64 678, i64 891, i64 789, i64 912}
+!13 = !{!14, !"cold"}
+!14 = !{i64 678, i64 891, i64 789, i64 812}
+!15 = !{i64 678}
+!16 = !{i64 789}
+!17 = !{i64 891}
+!18 = !{i64 912}
+!19 = !{i64 812}
diff --git a/llvm/test/Transforms/ObjCARC/apelim.ll b/llvm/test/Transforms/ObjCARC/apelim.ll
index 2ac5d15d0..01179f3 100644
--- a/llvm/test/Transforms/ObjCARC/apelim.ll
+++ b/llvm/test/Transforms/ObjCARC/apelim.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -passes=objc-arc-apelim < %s | FileCheck %s
+; RUN: opt -S -passes=objc-arc < %s | FileCheck %s
; rdar://10227311
@llvm.global_ctors = appending global [2 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @_GLOBAL__I_x, ptr null }, { i32, ptr, ptr } { i32 65535, ptr @_GLOBAL__I_y, ptr null }]
diff --git a/llvm/test/Transforms/ObjCARC/comdat-ipo.ll b/llvm/test/Transforms/ObjCARC/comdat-ipo.ll
index 3f91d3b..d43804c 100644
--- a/llvm/test/Transforms/ObjCARC/comdat-ipo.ll
+++ b/llvm/test/Transforms/ObjCARC/comdat-ipo.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -passes=objc-arc-apelim < %s | FileCheck %s
+; RUN: opt -S -passes=objc-arc < %s | FileCheck %s
; See PR26774
diff --git a/llvm/test/Transforms/PGOProfile/prof-verify.ll b/llvm/test/Transforms/PGOProfile/prof-verify.ll
index c83475a..3d984d8 100644
--- a/llvm/test/Transforms/PGOProfile/prof-verify.ll
+++ b/llvm/test/Transforms/PGOProfile/prof-verify.ll
@@ -3,6 +3,7 @@
; RUN: opt -passes=prof-inject %s -S -o - | FileCheck %s --check-prefix=INJECT
; RUN: not opt -passes=prof-verify %s -S -o - 2>&1 | FileCheck %s --check-prefix=VERIFY
; RUN: opt -passes=prof-inject,prof-verify %s --disable-output
+; RUN: opt -enable-profcheck %s -S -o - | FileCheck %s --check-prefix=INJECT
define void @foo(i32 %i) {
%c = icmp eq i32 %i, 0
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
index 26573a3..a201983 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
@@ -80,29 +80,33 @@ define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32
; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP48:%.*]] = add nsw <16 x i32> [[TMP45]], [[TMP47]]
; CHECK-NEXT: [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP44]], [[TMP46]]
-; CHECK-NEXT: [[TMP50:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 16, i32 0, i32 17, i32 1, i32 18, i32 2, i32 19, i32 3, i32 20, i32 4, i32 21, i32 5, i32 22, i32 6, i32 23, i32 7>
-; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 17, i32 1, i32 16, i32 0, i32 19, i32 3, i32 18, i32 2, i32 21, i32 5, i32 20, i32 4, i32 23, i32 7, i32 22, i32 6>
-; CHECK-NEXT: [[TMP52:%.*]] = add nsw <16 x i32> [[TMP50]], [[TMP51]]
-; CHECK-NEXT: [[TMP53:%.*]] = sub nsw <16 x i32> [[TMP50]], [[TMP51]]
-; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <16 x i32> [[TMP53]], <16 x i32> [[TMP52]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
-; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT: [[TMP56:%.*]] = sub nsw <16 x i32> [[TMP54]], [[TMP55]]
-; CHECK-NEXT: [[TMP57:%.*]] = add nsw <16 x i32> [[TMP54]], [[TMP55]]
-; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP56]], <16 x i32> [[TMP57]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP62:%.*]] = add nsw <16 x i32> [[TMP59]], [[TMP61]]
-; CHECK-NEXT: [[TMP63:%.*]] = sub nsw <16 x i32> [[TMP58]], [[TMP60]]
-; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP65:%.*]] = lshr <16 x i32> [[TMP64]], splat (i32 15)
-; CHECK-NEXT: [[TMP66:%.*]] = and <16 x i32> [[TMP65]], splat (i32 65537)
-; CHECK-NEXT: [[TMP67:%.*]] = mul nuw <16 x i32> [[TMP66]], splat (i32 65535)
-; CHECK-NEXT: [[TMP68:%.*]] = add <16 x i32> [[TMP67]], [[TMP64]]
-; CHECK-NEXT: [[TMP69:%.*]] = xor <16 x i32> [[TMP68]], [[TMP67]]
-; CHECK-NEXT: [[TMP70:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]])
-; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP70]], 65535
-; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP70]], 16
+; CHECK-NEXT: [[TMP50:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP53]]
+; CHECK-NEXT: [[TMP55:%.*]] = sub nsw <16 x i32> [[TMP50]], [[TMP52]]
+; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP60:%.*]] = sub nsw <16 x i32> [[TMP57]], [[TMP59]]
+; CHECK-NEXT: [[TMP61:%.*]] = add nsw <16 x i32> [[TMP56]], [[TMP58]]
+; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP66:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP65]]
+; CHECK-NEXT: [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP62]], [[TMP64]]
+; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT: [[TMP69:%.*]] = lshr <16 x i32> [[TMP68]], splat (i32 15)
+; CHECK-NEXT: [[TMP70:%.*]] = and <16 x i32> [[TMP69]], splat (i32 65537)
+; CHECK-NEXT: [[TMP71:%.*]] = mul nuw <16 x i32> [[TMP70]], splat (i32 65535)
+; CHECK-NEXT: [[TMP72:%.*]] = add <16 x i32> [[TMP71]], [[TMP68]]
+; CHECK-NEXT: [[TMP73:%.*]] = xor <16 x i32> [[TMP72]], [[TMP71]]
+; CHECK-NEXT: [[TMP74:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP73]])
+; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP74]], 65535
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP74]], 16
; CHECK-NEXT: [[RDD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]]
; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[RDD119]], 1
; CHECK-NEXT: ret i32 [[SHR120]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll
index 07fdc9d..7408ba1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll
@@ -4,9 +4,6 @@
define void @test() {
; CHECK-LABEL: define void @test() {
; CHECK-NEXT: [[BB:.*:]]
-; CHECK-NEXT: [[ADD:%.*]] = add i32 1, 0
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[ADD]], i32 3
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[TMP0]], zeroinitializer
; CHECK-NEXT: [[ICMP:%.*]] = icmp samesign ult i32 0, 0
; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[ICMP]], i32 0, i32 0
; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SELECT]] to i64
@@ -17,8 +14,7 @@ define void @test() {
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[CALL]], i32 3
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 undef, i1 undef, i1 undef, i1 undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
; CHECK-NEXT: ret void
;
bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
index 514d5f9..7a1cf7b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
@@ -6,15 +6,13 @@ define i64 @foo(i32 %tmp7) {
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 poison>, i32 [[TMP8:%.*]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP0]], <i32 0, i32 0, i32 poison, i32 0, i32 0, i32 poison, i32 0, i32 poison>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 0>, <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 14, i32 poison, i32 poison, i32 7>
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP2]], i32 0, i32 5
-; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <8 x i32> [[TMP13]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[TMP13]], [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 poison, i32 0>, [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 poison, i32 0>, [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 12, i32 13, i32 6, i32 7>
-; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i32> zeroinitializer, [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i32> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP9]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 0, [[TMP10]]
+; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> zeroinitializer, [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP10]])
+; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 0, [[TMP8]]
; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[OP_RDX]] to i64
; CHECK-NEXT: ret i64 [[TMP64]]
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll b/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll
index 15ba98f..5e3d471 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll
@@ -7,17 +7,10 @@ define i32 @test() {
; CHECK-NEXT: br label %[[FUNC_135_EXIT_I:.*]]
; CHECK: [[FUNC_135_EXIT_I]]:
; CHECK-NEXT: [[G_228_PROMOTED166_I1105_I:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 poison, i32 poison>, i32 [[G_228_PROMOTED166_I1105_I]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> zeroinitializer, [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> poison, i32 [[G_228_PROMOTED166_I1105_I]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> [[TMP9]], <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 23, i32 8, i32 9, i32 10, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <12 x i32> [[TMP3]], <12 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP17]], <16 x i32> [[TMP8]], <16 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 23, i32 24, i32 25, i32 26, i32 2, i32 2, i32 2, i32 2, i32 3>
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 poison>, i32 [[G_228_PROMOTED166_I1105_I]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP2:%.*]] = add <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison>, [[TMP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4>
; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <16 x i32> [[TMP11]], zeroinitializer
; CHECK-NEXT: [[TMP13:%.*]] = icmp ult <16 x i32> [[TMP11]], zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i1> [[TMP12]], <16 x i1> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll b/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll
index 1c482e0..03d76ef 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll
@@ -4,11 +4,10 @@
define i64 @test() {
; CHECK-LABEL: define i64 @test() {
; CHECK-NEXT: [[BB:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 0, i32 1
; CHECK-NEXT: br label %[[BB1:.*]]
; CHECK: [[BB1]]:
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP4:%.*]], %[[BB5:.*]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> zeroinitializer, [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP4]] = or <2 x i32> [[TMP3]], zeroinitializer
; CHECK-NEXT: br label %[[BB5]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll
index 652abef..6bb52e0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll
@@ -7,19 +7,17 @@ define void @test() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[BB1:.*]]
; CHECK: [[IF_THEN_I_I:.*]]:
-; CHECK-NEXT: br label %[[BB5:.*]]
+; CHECK-NEXT: br label %[[BB3:.*]]
; CHECK: [[BB1]]:
; CHECK-NEXT: [[TMP0:%.*]] = zext i1 false to i64
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i64> zeroinitializer, [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> <i64 0, i64 0, i64 poison, i64 poison>, <4 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT: br i1 false, label %[[BB5]], label %[[BB2:.*]]
-; CHECK: [[BB5]]:
-; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x i64> [ [[TMP4]], %[[BB1]] ], [ poison, %[[IF_THEN_I_I]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> <i64 0, i64 0, i64 poison, i64 0>, i64 [[TMP0]], i32 2
+; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> zeroinitializer, [[TMP1]]
+; CHECK-NEXT: br i1 false, label %[[BB3]], label %[[BB2:.*]]
+; CHECK: [[BB3]]:
+; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i64> [ [[TMP2]], %[[BB1]] ], [ poison, %[[IF_THEN_I_I]] ]
; CHECK-NEXT: br label %[[BB2]]
; CHECK: [[BB2]]:
-; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i64> [ [[TMP6]], %[[BB5]] ], [ [[TMP4]], %[[BB1]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i64> [ [[TMP4]], %[[BB3]] ], [ [[TMP2]], %[[BB1]] ]
; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr getelementptr inbounds nuw (i8, ptr null, i64 40), align 8
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
index a4949bc..782aada 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
@@ -6,14 +6,9 @@ target triple = "x86_64-unknown-linux-gnu"
define <4 x i32> @foo(<4 x i32> %x, i32 %f) {
; CHECK-LABEL: @foo(
-; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[F:%.*]], i32 0
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[F]], 1
-; CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[ADD]], i32 1
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 2, i32 3>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[VECINIT51:%.*]] = shufflevector <4 x i32> [[VECINIT1]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[F:%.*]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[VECINIT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[VECINIT51:%.*]] = add <4 x i32> [[TMP2]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: ret <4 x i32> [[VECINIT51]]
;
%vecinit = insertelement <4 x i32> undef, i32 %f, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll
index 049bb2e..c4d133a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_extract_broadcast.ll
@@ -18,7 +18,7 @@ define void @fextr(ptr %ptr) {
; YAML-NEXT: Function: fextr
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'Stores SLP vectorized with cost '
-; YAML-NEXT: - Cost: '-20'
+; YAML-NEXT: - Cost: '-21'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '4'
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll
index 8f6a53c..f7811ab 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll
@@ -6,14 +6,6 @@ define <4 x i16> @test() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> zeroinitializer, <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP1:%.*]] = add <16 x i16> [[TMP0]], zeroinitializer
-; CHECK-NEXT: [[TMP25:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> zeroinitializer)
-; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i16> poison, i16 [[TMP25]], i64 0
-; CHECK-NEXT: [[TMP28:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> zeroinitializer)
-; CHECK-NEXT: [[TMP29:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP28]], i64 1
-; CHECK-NEXT: [[TMP31:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> zeroinitializer)
-; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i16> [[TMP29]], i16 [[TMP31]], i64 2
-; CHECK-NEXT: [[TMP34:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> zeroinitializer)
-; CHECK-NEXT: [[TMP35:%.*]] = insertelement <4 x i16> [[TMP32]], i16 [[TMP34]], i64 3
; CHECK-NEXT: [[RDX_OP:%.*]] = or <16 x i16> zeroinitializer, [[TMP1]]
; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
; CHECK-NEXT: [[TMP37:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP36]])
@@ -28,8 +20,7 @@ define <4 x i16> @test() {
; CHECK-NEXT: [[TMP46:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP45]])
; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP46]], i64 3
; CHECK-NEXT: [[OP_RDX9:%.*]] = or <4 x i16> [[TMP47]], zeroinitializer
-; CHECK-NEXT: [[OP_RDX11:%.*]] = or <4 x i16> [[OP_RDX9]], [[TMP35]]
-; CHECK-NEXT: ret <4 x i16> [[OP_RDX11]]
+; CHECK-NEXT: ret <4 x i16> [[OP_RDX9]]
;
entry:
%subi = add <4 x i16> zeroinitializer, zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/user-node-no-state.ll b/llvm/test/Transforms/SLPVectorizer/X86/user-node-no-state.ll
new file mode 100644
index 0000000..237f308
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/user-node-no-state.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+@g = global [128 x i8] zeroinitializer, align 16
+
+define i64 @test() {
+; CHECK-LABEL: define i64 @test() {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @g, align 8
+; CHECK-NEXT: br label %[[FUNC_154_EXIT_FUNC_146_EXIT_CRIT_EDGE_I:.*]]
+; CHECK: [[FUNC_154_EXIT_FUNC_146_EXIT_CRIT_EDGE_I]]:
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr inbounds nuw (i8, ptr @g, i64 80), align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr inbounds nuw (i8, ptr @g, i64 88), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr inbounds nuw (i8, ptr @g, i64 32), align 16
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @g, align 16
+; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr getelementptr inbounds nuw (i8, ptr @g, i64 8), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @g, align 16
+; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr getelementptr inbounds nuw (i8, ptr @g, i64 24), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[TMP9]], [[TMP4]]
+; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], [[TMP6]]
+; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], [[TMP7]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], [[TMP0]]
+; CHECK-NEXT: ret i64 [[TMP14]]
+;
+entry:
+ %0 = load i64, ptr @g, align 8
+ br label %func_154.exit.func_146.exit_crit_edge.i
+
+func_154.exit.func_146.exit_crit_edge.i:
+ %1 = load i64, ptr getelementptr inbounds nuw (i8, ptr @g, i64 80), align 16
+ %2 = load i64, ptr getelementptr inbounds nuw (i8, ptr @g, i64 88), align 8
+ %3 = load i64, ptr getelementptr inbounds nuw (i8, ptr @g, i64 32), align 16
+ %4 = load i64, ptr @g, align 16
+ %5 = load i64, ptr getelementptr inbounds nuw (i8, ptr @g, i64 8), align 8
+ %6 = load i64, ptr @g, align 16
+ %7 = load i64, ptr getelementptr inbounds nuw (i8, ptr @g, i64 24), align 8
+ %8 = xor i64 %1, %2
+ %9 = xor i64 %8, %3
+ %10 = xor i64 %9, %4
+ %11 = xor i64 %10, %5
+ %12 = xor i64 %11, %6
+ %13 = xor i64 %12, %7
+ %14 = xor i64 %13, %0
+ ret i64 %14
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
index ad4daea..125c2dc 100644
--- a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
+++ b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
@@ -150,9 +150,9 @@ define <2 x i32> @replace_through_int_casts_ele0_only(i16 %inp, <2 x i16> %dead)
define <2 x i8> @replace_through_binop_fail_cant_speculate(i8 %inp, <2 x i8> %d, <2 x i8> %any) {
; CHECK-LABEL: define <2 x i8> @replace_through_binop_fail_cant_speculate(
; CHECK-SAME: i8 [[INP:%.*]], <2 x i8> [[D:%.*]], <2 x i8> [[ANY:%.*]]) {
-; CHECK-NEXT: [[ADD:%.*]] = add i8 [[INP]], 5
-; CHECK-NEXT: [[V0:%.*]] = insertelement <2 x i8> poison, i8 [[INP]], i64 0
-; CHECK-NEXT: [[V:%.*]] = insertelement <2 x i8> [[V0]], i8 [[ADD]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i8> poison, i8 [[INP]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[V:%.*]] = add <2 x i8> [[TMP2]], <i8 0, i8 5>
; CHECK-NEXT: [[DIV0:%.*]] = sdiv <2 x i8> splat (i8 -128), [[V]]
; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[INP]], 123
; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i8> [[DIV0]], i8 [[TMP1]], i64 0
diff --git a/llvm/test/Transforms/StructurizeCFG/hoist-zerocost.ll b/llvm/test/Transforms/StructurizeCFG/hoist-zerocost.ll
new file mode 100644
index 0000000..10d4fa2
--- /dev/null
+++ b/llvm/test/Transforms/StructurizeCFG/hoist-zerocost.ll
@@ -0,0 +1,161 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=structurizecfg < %s | FileCheck %s
+
+
+%pair = type { i32, i32 }
+define void @test_if_then_else(ptr %ptr, i1 %cond) {
+; CHECK-LABEL: define void @test_if_then_else(
+; CHECK-SAME: ptr [[PTR:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[COND_INV:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT: [[LOAD_THEN:%.*]] = load [[PAIR:%.*]], ptr [[PTR]], align 4
+; CHECK-NEXT: [[A_THEN:%.*]] = extractvalue [[PAIR]] [[LOAD_THEN]], 0
+; CHECK-NEXT: br i1 [[COND_INV]], label %[[ELSE:.*]], label %[[FLOW:.*]]
+; CHECK: [[FLOW]]:
+; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[SUM_ELSE:%.*]], %[[ELSE]] ], [ [[A_THEN]], %[[ENTRY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[ELSE]] ], [ true, %[[ENTRY]] ]
+; CHECK-NEXT: br i1 [[TMP1]], label %[[THEN:.*]], label %[[MERGE:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: br label %[[MERGE]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: [[A_ELSE:%.*]] = extractvalue [[PAIR]] [[LOAD_THEN]], 0
+; CHECK-NEXT: [[SUM_ELSE]] = add i32 [[A_ELSE]], 1
+; CHECK-NEXT: br label %[[FLOW]]
+; CHECK: [[MERGE]]:
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %load_then = load %pair, ptr %ptr
+ br i1 %cond, label %then, label %else
+
+then:
+ %a_then = extractvalue %pair %load_then, 0
+ br label %merge
+
+else:
+ %a_else = extractvalue %pair %load_then, 0
+ %sum_else = add i32 %a_else, 1
+ br label %merge
+
+merge:
+ %phi = phi i32 [ %a_then, %then ], [ %sum_else, %else ]
+ store i32 %phi, ptr %ptr
+ ret void
+}
+
+define void @test_if_else_then(ptr %ptr, i1 %cond) {
+; CHECK-LABEL: define void @test_if_else_then(
+; CHECK-SAME: ptr [[PTR:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[COND_INV:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT: [[LOAD_THEN:%.*]] = load [[PAIR:%.*]], ptr [[PTR]], align 4
+; CHECK-NEXT: br i1 [[COND_INV]], label %[[THEN:.*]], label %[[FLOW:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: [[A_THEN:%.*]] = extractvalue [[PAIR]] [[LOAD_THEN]], 0
+; CHECK-NEXT: br label %[[FLOW]]
+; CHECK: [[FLOW]]:
+; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[A_THEN]], %[[THEN]] ], [ poison, %[[ENTRY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[THEN]] ], [ true, %[[ENTRY]] ]
+; CHECK-NEXT: br i1 [[TMP1]], label %[[ELSE:.*]], label %[[MERGE:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: [[A_ELSE:%.*]] = extractvalue [[PAIR]] [[LOAD_THEN]], 0
+; CHECK-NEXT: [[SUM_ELSE:%.*]] = add i32 [[A_ELSE]], 1
+; CHECK-NEXT: br label %[[MERGE]]
+; CHECK: [[MERGE]]:
+; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[TMP0]], %[[FLOW]] ], [ [[SUM_ELSE]], %[[ELSE]] ]
+; CHECK-NEXT: store i32 [[PHI]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %load_then = load %pair, ptr %ptr
+ br i1 %cond, label %else, label %then
+
+then:
+ %a_then = extractvalue %pair %load_then, 0
+ br label %merge
+
+else:
+ %a_else = extractvalue %pair %load_then, 0
+ %sum_else = add i32 %a_else, 1
+ br label %merge
+
+merge:
+ %phi = phi i32 [ %a_then, %then ], [ %sum_else, %else ]
+ store i32 %phi, ptr %ptr
+ ret void
+}
+
+define amdgpu_kernel void @test_loop_with_if( ptr %ptr, i1 %cond) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @test_loop_with_if(
+; CHECK-SAME: ptr [[PTR:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[COND_INV:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[I3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I15:%.*]], %[[LATCH:.*]] ]
+; CHECK-NEXT: [[LOAD:%.*]] = load [[PAIR:%.*]], ptr [[PTR]], align 4
+; CHECK-NEXT: [[A_THEN:%.*]] = extractvalue [[PAIR]] [[LOAD]], 0
+; CHECK-NEXT: br i1 [[COND]], label %[[IF:.*]], label %[[FLOW:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: [[I9:%.*]] = icmp sle i32 [[I3]], 10
+; CHECK-NEXT: br label %[[FLOW]]
+; CHECK: [[FLOW1:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[Y:%.*]], %[[ELSE:.*]] ], [ [[A_THEN]], %[[FLOW]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[ELSE]] ], [ [[TMP2:%.*]], %[[FLOW]] ]
+; CHECK-NEXT: br i1 [[TMP1]], label %[[THEN:.*]], label %[[LATCH]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: store i32 [[I]], ptr [[PTR]], align 4
+; CHECK-NEXT: br label %[[LATCH]]
+; CHECK: [[FLOW]]:
+; CHECK-NEXT: [[TMP2]] = phi i1 [ true, %[[IF]] ], [ false, %[[LOOP]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = phi i1 [ [[I9]], %[[IF]] ], [ [[COND_INV]], %[[LOOP]] ]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[ELSE]], label %[[FLOW1]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: [[I2:%.*]] = extractvalue [[PAIR]] [[LOAD]], 1
+; CHECK-NEXT: [[A_ELSE:%.*]] = extractvalue [[PAIR]] [[LOAD]], 0
+; CHECK-NEXT: [[Y]] = add i32 [[A_ELSE]], [[I2]]
+; CHECK-NEXT: br label %[[FLOW1]]
+; CHECK: [[LATCH]]:
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[PTR]], align 4
+; CHECK-NEXT: [[I15]] = add nsw i32 [[TMP0]], 20
+; CHECK-NEXT: [[I16:%.*]] = icmp sge i32 [[I15]], 255
+; CHECK-NEXT: br i1 [[I16]], label %[[END:.*]], label %[[LOOP]]
+; CHECK: [[END]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ %a = tail call i32 @llvm.amdgcn.workitem.id.x()
+ br label %loop
+
+loop:
+ %entry_phi = phi i32 [ 0, %entry ], [ %a15, %latch ]
+ %load = load %pair, ptr %ptr
+ br i1 %cond, label %if, label %else
+
+if:
+ %cmp = icmp sgt i32 %entry_phi, 10
+ br i1 %cmp, label %then, label %else
+
+then:
+ %a_then = extractvalue %pair %load, 0
+ store i32 %a, ptr %ptr, align 4
+ br label %latch
+
+else:
+ %a2 = extractvalue %pair %load, 1
+ %y = extractvalue %pair %load, 0
+ %a_else = add i32 %y, %a2
+ br label %latch
+
+latch:
+ %a_test = phi i32 [ %a_then, %then ], [ %a_else, %else ]
+ store i32 %a_test, ptr %ptr
+ %a15 = add nsw i32 %a_test, 20
+ %a16 = icmp slt i32 %a15, 255
+ br i1 %a16, label %loop, label %end
+
+end:
+ ret void
+}
diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in
index ee76beb..893e2cb 100644
--- a/llvm/test/lit.site.cfg.py.in
+++ b/llvm/test/lit.site.cfg.py.in
@@ -55,6 +55,7 @@ config.linked_bye_extension = @LLVM_BYE_LINK_INTO_TOOLS@
config.linked_exampleirtransforms_extension = @LLVM_EXAMPLEIRTRANSFORMS_LINK_INTO_TOOLS@
config.have_tf_aot = @LLVM_HAVE_TF_AOT@
config.have_tflite = @LLVM_HAVE_TFLITE@
+config.enable_profcheck = @LLVM_ENABLE_PROFCHECK@
config.llvm_inliner_model_autogenerated = @LLVM_INLINER_MODEL_AUTOGENERATED@
config.llvm_raevict_model_autogenerated = @LLVM_RAEVICT_MODEL_AUTOGENERATED@
config.expensive_checks = @LLVM_ENABLE_EXPENSIVE_CHECKS@
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected
index 51cafac..e1da112 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected
@@ -10,15 +10,15 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct
; CHECK-NEXT: .reg .b64 %rd<13>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [caller_St8x4_param_0+8];
-; CHECK-NEXT: ld.param.b64 %rd2, [caller_St8x4_param_0];
-; CHECK-NEXT: ld.param.b64 %rd3, [caller_St8x4_param_0+24];
-; CHECK-NEXT: ld.param.b64 %rd4, [caller_St8x4_param_0+16];
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 16 .b8 param0[32];
-; CHECK-NEXT: st.param.v2.b64 [param0], {%rd2, %rd1};
-; CHECK-NEXT: st.param.v2.b64 [param0+16], {%rd4, %rd3};
; CHECK-NEXT: .param .align 16 .b8 retval0[32];
+; CHECK-NEXT: ld.param.b64 %rd1, [caller_St8x4_param_0+24];
+; CHECK-NEXT: ld.param.b64 %rd2, [caller_St8x4_param_0+16];
+; CHECK-NEXT: st.param.v2.b64 [param0+16], {%rd2, %rd1};
+; CHECK-NEXT: ld.param.b64 %rd3, [caller_St8x4_param_0+8];
+; CHECK-NEXT: ld.param.b64 %rd4, [caller_St8x4_param_0];
+; CHECK-NEXT: st.param.v2.b64 [param0], {%rd4, %rd3};
; CHECK-NEXT: call.uni (retval0), callee_St8x4, (param0);
; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [retval0];
; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [retval0+16];
diff --git a/llvm/test/tools/llvm-readtapi/many-targets.test b/llvm/test/tools/llvm-readtapi/many-targets.test
new file mode 100644
index 0000000..efb44b5
--- /dev/null
+++ b/llvm/test/tools/llvm-readtapi/many-targets.test
@@ -0,0 +1,20 @@
+; RUN: rm -rf %t
+; RUN: split-file %s %t
+;
+; RUN: llvm-readtapi %t/many-targets.tbd
+;
+; Check that tbds containing symbols with many targets parse correctly (and in
+; particular parse without leaks).
+
+;--- many-targets.tbd
+--- !tapi-tbd
+tbd-version: 4
+targets: [ x86_64-macos, x86_64-maccatalyst, arm64-macos, arm64-maccatalyst,
+ arm64e-macos, arm64e-maccatalyst, arm64-ios, arm64e-ios ]
+install-name: '/usr/lib/foo.dylib'
+current-version: 1
+exports:
+ - targets: [ x86_64-macos, x86_64-maccatalyst, arm64-macos, arm64-maccatalyst,
+ arm64e-macos, arm64e-maccatalyst, arm64-ios, arm64e-ios ]
+ symbols: [ 'foo' ]
+...
diff --git a/llvm/tools/dsymutil/MachOUtils.cpp b/llvm/tools/dsymutil/MachOUtils.cpp
index be1934f..362a999 100644
--- a/llvm/tools/dsymutil/MachOUtils.cpp
+++ b/llvm/tools/dsymutil/MachOUtils.cpp
@@ -331,7 +331,7 @@ static bool createDwarfSegment(const MCAssembler &Asm, uint64_t VMAddr, uint64_t
/*InitProt=*/3);
for (unsigned int i = 0, n = Writer.getSectionOrder().size(); i != n; ++i) {
- MCSection *Sec = Writer.getSectionOrder()[i];
+ auto *Sec = static_cast<MCSectionMachO *>(Writer.getSectionOrder()[i]);
if (!Asm.getSectionFileSize(*Sec))
continue;
diff --git a/llvm/tools/opt/NewPMDriver.cpp b/llvm/tools/opt/NewPMDriver.cpp
index 7d168a6..b9b8929 100644
--- a/llvm/tools/opt/NewPMDriver.cpp
+++ b/llvm/tools/opt/NewPMDriver.cpp
@@ -40,6 +40,7 @@
#include "llvm/Transforms/Instrumentation/AddressSanitizer.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/Debugify.h"
+#include "llvm/Transforms/Utils/ProfileVerify.h"
using namespace llvm;
using namespace opt_tool;
@@ -356,7 +357,7 @@ bool llvm::runPassPipeline(
OutputKind OK, VerifierKind VK, bool ShouldPreserveAssemblyUseListOrder,
bool ShouldPreserveBitcodeUseListOrder, bool EmitSummaryIndex,
bool EmitModuleHash, bool EnableDebugify, bool VerifyDIPreserve,
- bool UnifiedLTO) {
+ bool EnableProfcheck, bool UnifiedLTO) {
auto FS = vfs::getRealFileSystem();
std::optional<PGOOptions> P;
switch (PGOKindFlag) {
@@ -487,7 +488,8 @@ bool llvm::runPassPipeline(
if (VerifyDIPreserve)
MPM.addPass(NewPMDebugifyPass(DebugifyMode::OriginalDebugInfo, "",
&DebugInfoBeforePass));
-
+ if (EnableProfcheck)
+ MPM.addPass(createModuleToFunctionPassAdaptor(ProfileInjectorPass()));
// Add passes according to the -passes options.
if (!PassPipeline.empty()) {
if (auto Err = PB.parsePassPipeline(MPM, PassPipeline)) {
@@ -504,6 +506,8 @@ bool llvm::runPassPipeline(
MPM.addPass(NewPMCheckDebugifyPass(
false, "", nullptr, DebugifyMode::OriginalDebugInfo,
&DebugInfoBeforePass, VerifyDIPreserveExport));
+ if (EnableProfcheck)
+ MPM.addPass(createModuleToFunctionPassAdaptor(ProfileVerifierPass()));
// Add any relevant output pass at the end of the pipeline.
switch (OK) {
diff --git a/llvm/tools/opt/NewPMDriver.h b/llvm/tools/opt/NewPMDriver.h
index 2daae57..6c21d6c 100644
--- a/llvm/tools/opt/NewPMDriver.h
+++ b/llvm/tools/opt/NewPMDriver.h
@@ -75,7 +75,7 @@ bool runPassPipeline(
bool ShouldPreserveAssemblyUseListOrder,
bool ShouldPreserveBitcodeUseListOrder, bool EmitSummaryIndex,
bool EmitModuleHash, bool EnableDebugify, bool VerifyDIPreserve,
- bool UnifiedLTO = false);
+ bool EnableProfcheck, bool UnifiedLTO = false);
} // namespace llvm
#endif
diff --git a/llvm/tools/opt/optdriver.cpp b/llvm/tools/opt/optdriver.cpp
index 892b63b..4a3b058 100644
--- a/llvm/tools/opt/optdriver.cpp
+++ b/llvm/tools/opt/optdriver.cpp
@@ -217,6 +217,15 @@ static cl::opt<bool> VerifyDebugInfoPreserve(
cl::desc("Start the pipeline with collecting and end it with checking of "
"debug info preservation."));
+static cl::opt<bool> EnableProfileVerification(
+ "enable-profcheck",
+#if defined(LLVM_ENABLE_PROFCHECK)
+ cl::init(true),
+#else
+ cl::init(false),
+#endif
+ cl::desc("Start the pipeline with prof-inject and end it with prof-check"));
+
static cl::opt<std::string> ClDataLayout("data-layout",
cl::desc("data layout string to use"),
cl::value_desc("layout-string"),
@@ -746,7 +755,8 @@ extern "C" int optMain(
RemarksFile.get(), Pipeline, PluginList, PassBuilderCallbacks,
OK, VK, PreserveAssemblyUseListOrder,
PreserveBitcodeUseListOrder, EmitSummaryIndex, EmitModuleHash,
- EnableDebugify, VerifyDebugInfoPreserve, UnifiedLTO)
+ EnableDebugify, VerifyDebugInfoPreserve,
+ EnableProfileVerification, UnifiedLTO)
? 0
: 1;
}
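
The three opt changes above thread the new -enable-profcheck flag (default on
only when LLVM_ENABLE_PROFCHECK is defined) through runPassPipeline, so that
ProfileInjectorPass runs before the user pipeline and ProfileVerifierPass
after it. A minimal sketch of that bracketing, assuming a Module M and a
configured ModuleAnalysisManager MAM already exist:

    #include "llvm/IR/PassManager.h"
    #include "llvm/Transforms/Utils/ProfileVerify.h"

    using namespace llvm;

    // Sketch: wrap an arbitrary user pipeline with profile injection and
    // verification, mirroring the NewPMDriver.cpp change above.
    void runWithProfcheck(Module &M, ModuleAnalysisManager &MAM,
                          ModulePassManager UserPasses) {
      ModulePassManager MPM;
      MPM.addPass(createModuleToFunctionPassAdaptor(ProfileInjectorPass()));
      MPM.addPass(std::move(UserPasses));
      MPM.addPass(createModuleToFunctionPassAdaptor(ProfileVerifierPass()));
      MPM.run(M, MAM);
    }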
diff --git a/llvm/tools/spirv-tools/CMakeLists.txt b/llvm/tools/spirv-tools/CMakeLists.txt
index c2c0f3e..5db7aec 100644
--- a/llvm/tools/spirv-tools/CMakeLists.txt
+++ b/llvm/tools/spirv-tools/CMakeLists.txt
@@ -5,10 +5,6 @@ if (NOT LLVM_INCLUDE_SPIRV_TOOLS_TESTS)
return()
endif ()
-if (NOT "SPIRV" IN_LIST LLVM_TARGETS_TO_BUILD)
- message(FATAL_ERROR "Building SPIRV-Tools tests is unsupported without the SPIR-V target")
-endif ()
-
# SPIRV_DIS, SPIRV_VAL, SPIRV_AS and SPIRV_LINK variables can be used to provide paths to existing
# spirv-dis, spirv-val, spirv-as, and spirv-link binaries, respectively. Otherwise, build them from
# SPIRV-Tools source.
diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp b/llvm/unittests/Analysis/IR2VecTest.cpp
index 7c9a546..e288585 100644
--- a/llvm/unittests/Analysis/IR2VecTest.cpp
+++ b/llvm/unittests/Analysis/IR2VecTest.cpp
@@ -364,9 +364,9 @@ TEST_F(IR2VecTestFixture, GetFunctionVector) {
EXPECT_TRUE(FuncVec.approximatelyEquals(Embedding(2, 44.4)));
}
-static constexpr unsigned MaxOpcodes = 67;
-static constexpr unsigned MaxTypeIDs = 21;
-static constexpr unsigned MaxOperands = 4;
+static constexpr unsigned MaxOpcodes = Vocabulary::MaxOpcodes;
+static constexpr unsigned MaxTypeIDs = Vocabulary::MaxTypeIDs;
+static constexpr unsigned MaxOperands = Vocabulary::MaxOperandKinds;
TEST(IR2VecVocabularyTest, DummyVocabTest) {
for (unsigned Dim = 1; Dim <= 10; ++Dim) {
diff --git a/llvm/unittests/CodeGen/DroppedVariableStatsMIRTest.cpp b/llvm/unittests/CodeGen/DroppedVariableStatsMIRTest.cpp
index 36504f5..e72b4e4 100644
--- a/llvm/unittests/CodeGen/DroppedVariableStatsMIRTest.cpp
+++ b/llvm/unittests/CodeGen/DroppedVariableStatsMIRTest.cpp
@@ -39,8 +39,7 @@ createTargetMachine(std::string TargetStr, StringRef CPU, StringRef FS) {
return nullptr;
TargetOptions Options;
return std::unique_ptr<TargetMachine>(
- static_cast<TargetMachine *>(T->createTargetMachine(
- TT, CPU, FS, Options, std::nullopt, std::nullopt)));
+ T->createTargetMachine(TT, CPU, FS, Options, std::nullopt, std::nullopt));
}
std::unique_ptr<Module> parseMIR(const TargetMachine &TM, StringRef MIRCode,
diff --git a/llvm/unittests/Support/DebugLogTest.cpp b/llvm/unittests/Support/DebugLogTest.cpp
index 5136999..0c464c1 100644
--- a/llvm/unittests/Support/DebugLogTest.cpp
+++ b/llvm/unittests/Support/DebugLogTest.cpp
@@ -6,7 +6,13 @@
//
//===----------------------------------------------------------------------===//
+// This macro is defined by the LLVM build system, but we undefine it here
+// so that at least one in-tree test exercises the case where __SHORT_FILE__
+// is not defined.
+#undef __SHORT_FILE__
+
#include "llvm/Support/DebugLog.h"
+#include "llvm/ADT/Sequence.h"
#include "llvm/Support/raw_ostream.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
@@ -26,8 +32,8 @@ TEST(DebugLogTest, Basic) {
{
std::string str;
raw_string_ostream os(str);
- DEBUGLOG_WITH_STREAM_AND_TYPE(os, nullptr) << "NoType";
- EXPECT_TRUE(StringRef(os.str()).starts_with('['));
+ DEBUGLOG_WITH_STREAM_AND_TYPE(os, 0, nullptr) << "NoType";
+ EXPECT_FALSE(StringRef(os.str()).starts_with('['));
EXPECT_TRUE(StringRef(os.str()).ends_with("NoType\n"));
}
@@ -35,8 +41,9 @@ TEST(DebugLogTest, Basic) {
{
std::string str;
raw_string_ostream os(str);
- DEBUGLOG_WITH_STREAM_AND_TYPE(os, "A") << "A";
- DEBUGLOG_WITH_STREAM_AND_TYPE(os, "B") << "B";
+ DEBUGLOG_WITH_STREAM_AND_TYPE(os, 0, "A") << "A";
+ DEBUGLOG_WITH_STREAM_AND_TYPE(os, 0, "B") << "B";
+ EXPECT_TRUE(StringRef(os.str()).starts_with('['));
EXPECT_THAT(os.str(), AllOf(HasSubstr("A\n"), HasSubstr("B\n")));
}
@@ -46,27 +53,82 @@ TEST(DebugLogTest, Basic) {
raw_string_ostream os(str);
// Just check that the macro doesn't result in dangling else.
if (true)
- DEBUGLOG_WITH_STREAM_AND_TYPE(os, "A") << "A";
+ DEBUGLOG_WITH_STREAM_AND_TYPE(os, 0, "A") << "A";
else
- DEBUGLOG_WITH_STREAM_AND_TYPE(os, "A") << "B";
- DEBUGLOG_WITH_STREAM_AND_TYPE(os, "B") << "B";
+ DEBUGLOG_WITH_STREAM_AND_TYPE(os, 0, "A") << "B";
+ DEBUGLOG_WITH_STREAM_AND_TYPE(os, 0, "B") << "B";
EXPECT_THAT(os.str(), AllOf(HasSubstr("A\n"), Not(HasSubstr("B\n"))));
int count = 0;
auto inc = [&]() { return ++count; };
EXPECT_THAT(count, Eq(0));
- DEBUGLOG_WITH_STREAM_AND_TYPE(os, "A") << inc();
+ DEBUGLOG_WITH_STREAM_AND_TYPE(os, 0, "A") << inc();
EXPECT_THAT(count, Eq(1));
- DEBUGLOG_WITH_STREAM_AND_TYPE(os, "B") << inc();
+ DEBUGLOG_WITH_STREAM_AND_TYPE(os, 0, "B") << inc();
EXPECT_THAT(count, Eq(1));
}
}
+
+TEST(DebugLogTest, BasicWithLevel) {
+ llvm::DebugFlag = true;
+ // We expect A to be always printed, B to be printed only when level is 1 or
+ // below, and C to be printed only when level is 0 or below.
+ static const char *DT[] = {"A", "B:1", "C:"};
+
+ setCurrentDebugTypes(DT, sizeof(DT) / sizeof(DT[0]));
+ std::string str;
+ raw_string_ostream os(str);
+ for (auto type : {"A", "B", "C", "D"})
+ for (int level : llvm::seq<int>(0, 4))
+ DEBUGLOG_WITH_STREAM_TYPE_FILE_AND_LINE(os, level, type, type, level)
+ << level;
+ EXPECT_EQ(os.str(), "[A:0] A:0 0\n[A:1] A:1 1\n[A:2] A:2 2\n[A:3] A:3 "
+ "3\n[B:0] B:0 0\n[B:1] B:1 1\n[C:0] C:0 0\n");
+}
+
+TEST(DebugLogTest, NegativeLevel) {
+ llvm::DebugFlag = true;
+ // Test the special behavior when all the levels are 0.
+ // In this case we expect all the debug types to be printed.
+ static const char *DT[] = {"A:"};
+
+ setCurrentDebugTypes(DT, sizeof(DT) / sizeof(DT[0]));
+ std::string str;
+ raw_string_ostream os(str);
+ for (auto type : {"A", "B"})
+ for (int level : llvm::seq<int>(0, 2))
+ DEBUGLOG_WITH_STREAM_TYPE_FILE_AND_LINE(os, level, type, type, level)
+ << level;
+ EXPECT_EQ(os.str(), "[A:0] A:0 0\n[B:0] B:0 0\n[B:1] B:1 1\n");
+}
+
+TEST(DebugLogTest, StreamPrefix) {
+ llvm::DebugFlag = true;
+ static const char *DT[] = {"A", "B"};
+ setCurrentDebugTypes(DT, 2);
+
+ std::string str;
+ raw_string_ostream os(str);
+ std::string expected = "PrefixA 1\nPrefixA 2\nPrefixA \nPrefixB "
+ "3\nPrefixB 4\nPrefixA 5";
+ {
+ llvm::impl::raw_ldbg_ostream ldbg_osB("PrefixB ", os);
+ llvm::impl::raw_ldbg_ostream ldbg_osA("PrefixA ", os);
+ ldbg_osA << "1\n2";
+ ldbg_osA << "\n\n";
+ ldbg_osB << "3\n4\n";
+ ldbg_osA << "5";
+ EXPECT_EQ(os.str(), expected);
+ }
+ // After the destructors run, stream B's pending newline is flushed.
+ EXPECT_EQ(os.str(), expected + "\nPrefixB \n");
+}
#else
TEST(DebugLogTest, Basic) {
// LDBG should be compiled out in NDEBUG, so just check it compiles and has
// no effect.
llvm::DebugFlag = true;
- static const char *DT[] = {};
+ static const char *DT[] = {"A"};
setCurrentDebugTypes(DT, 0);
int count = 0;
auto inc = [&]() { return ++count; };
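
For reference, the level filter exercised by BasicWithLevel is applied per
debug type: in the -debug-only list, a bare "A" applies no level cap, "B:1"
caps type "B" at level 1, and "C:" caps "C" at level 0. A small sketch using
the macro exactly as the tests above do (an asserts build is assumed):

    #include "llvm/Support/Debug.h"
    #include "llvm/Support/DebugLog.h"
    #include "llvm/Support/raw_ostream.h"
    #include <string>

    void levelFilterDemo() {
      llvm::DebugFlag = true;
      static const char *DT[] = {"A", "B:1", "C:"};
      llvm::setCurrentDebugTypes(DT, 3);

      std::string Str;
      llvm::raw_string_ostream OS(Str);
      // "A" is uncapped, "B" prints for levels <= 1, "C" only for level 0.
      DEBUGLOG_WITH_STREAM_TYPE_FILE_AND_LINE(OS, 3, "A", "f.cpp", 1) << "kept";
      DEBUGLOG_WITH_STREAM_TYPE_FILE_AND_LINE(OS, 1, "B", "f.cpp", 2) << "kept";
      DEBUGLOG_WITH_STREAM_TYPE_FILE_AND_LINE(OS, 2, "B", "f.cpp", 3) << "dropped";
      DEBUGLOG_WITH_STREAM_TYPE_FILE_AND_LINE(OS, 1, "C", "f.cpp", 4) << "dropped";
    }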
diff --git a/llvm/unittests/TargetParser/TripleTest.cpp b/llvm/unittests/TargetParser/TripleTest.cpp
index 36408de..35927e3 100644
--- a/llvm/unittests/TargetParser/TripleTest.cpp
+++ b/llvm/unittests/TargetParser/TripleTest.cpp
@@ -758,6 +758,12 @@ TEST(TripleTest, ParsedIDs) {
EXPECT_EQ(Triple::UnknownOS, T.getOS());
EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment());
+ T = Triple("riscv64-meta-unknown-mtia");
+ EXPECT_EQ(Triple::riscv64, T.getArch());
+ EXPECT_EQ(Triple::Meta, T.getVendor());
+ EXPECT_EQ(Triple::UnknownOS, T.getOS());
+ EXPECT_EQ(Triple::MTIA, T.getEnvironment());
+
T = Triple("riscv64-unknown-linux");
EXPECT_EQ(Triple::riscv64, T.getArch());
EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index 8012d9e..94b74d2 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -1075,7 +1075,7 @@ TEST_F(VPRecipeTest, CastVPBlendRecipeToVPUser) {
Args.push_back(I1);
Args.push_back(I2);
Args.push_back(M2);
- VPBlendRecipe Recipe(Phi, Args);
+ VPBlendRecipe Recipe(Phi, Args, {});
EXPECT_TRUE(isa<VPUser>(&Recipe));
VPRecipeBase *BaseR = &Recipe;
EXPECT_TRUE(isa<VPUser>(BaseR));
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
index 5394472..6214ea3 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
@@ -103,7 +103,7 @@ TEST_F(VPVerifierTest, VPBlendUseBeforeDefDifferentBB) {
auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
VPInstruction *BranchOnCond =
new VPInstruction(VPInstruction::BranchOnCond, {CanIV});
- auto *Blend = new VPBlendRecipe(Phi, {DefI});
+ auto *Blend = new VPBlendRecipe(Phi, {DefI}, {});
VPBasicBlock *VPBB1 = Plan.getEntry();
VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
diff --git a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
index 7f90d6b..a280604 100644
--- a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
@@ -242,11 +242,10 @@ public:
SmallVector<const Record *, 1024> AllRuntimeLibcallImpls(
AllRuntimeLibcallImplsRaw);
- // Sort by libcall impl name, not the enum name. This keeps the order
- // suitable for using the name table for libcall recognition binary search.
- llvm::sort(AllRuntimeLibcallImpls, [](const Record *A, const Record *B) {
- return A->getValueAsString("LibCallFuncName") <
- B->getValueAsString("LibCallFuncName");
+ // Sort by libcall impl name and secondarily by the enum name.
+ sort(AllRuntimeLibcallImpls, [](const Record *A, const Record *B) {
+ return std::pair(A->getValueAsString("LibCallFuncName"), A->getName()) <
+ std::pair(B->getValueAsString("LibCallFuncName"), B->getName());
});
RuntimeLibcallImplDefList.reserve(AllRuntimeLibcallImpls.size());
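
The comparator above relies on lexicographic comparison: the libcall
implementation name remains the primary key (preserving the binary-search
property of the name table), and the record name breaks ties so that records
with identical function names get a deterministic order. A generic sketch of
the idiom (struct and field names here are hypothetical):

    #include <algorithm>
    #include <string>
    #include <tuple>
    #include <vector>

    struct Impl {
      std::string FuncName; // primary key
      std::string DefName;  // tie-breaker
    };

    // std::tie builds tuples of references that compare lexicographically,
    // so this sorts by FuncName first and DefName second, without copies.
    void sortImpls(std::vector<Impl> &Impls) {
      std::sort(Impls.begin(), Impls.end(), [](const Impl &A, const Impl &B) {
        return std::tie(A.FuncName, A.DefName) <
               std::tie(B.FuncName, B.DefName);
      });
    }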
diff --git a/llvm/utils/TableGen/CompressInstEmitter.cpp b/llvm/utils/TableGen/CompressInstEmitter.cpp
index e80adce..89c175b 100644
--- a/llvm/utils/TableGen/CompressInstEmitter.cpp
+++ b/llvm/utils/TableGen/CompressInstEmitter.cpp
@@ -86,16 +86,22 @@ namespace {
class CompressInstEmitter {
struct OpData {
enum MapKind { Operand, Imm, Reg } Kind;
- union {
+ // Info for an operand.
+ struct OpndInfo {
+ // Record from the Dag.
+ const Record *DagRec;
// Operand number mapped to.
- unsigned OpNo;
+ unsigned Idx;
+ // Tied operand index within the instruction.
+ int TiedOpIdx;
+ };
+ union {
+ OpndInfo OpInfo;
// Integer immediate value.
int64_t ImmVal;
// Physical register.
const Record *RegRec;
};
- // Tied operand index within the instruction.
- int TiedOpIdx = -1;
};
struct ArgData {
unsigned DAGOpNo;
@@ -273,6 +279,31 @@ void CompressInstEmitter::addDagOperandMapping(const Record *Rec,
"' in the corresponding instruction operand!");
OperandMap[OpNo].Kind = OpData::Operand;
+ OperandMap[OpNo].OpInfo.DagRec = DI->getDef();
+ OperandMap[OpNo].OpInfo.TiedOpIdx = -1;
+
+ // Create a mapping between the operand name in the Dag (e.g. $rs1) and
+ // its index in the list of Dag operands and check that operands with
+ // the same name have the same type. For example in 'C_ADD $rs1, $rs2'
+ // we generate the mapping $rs1 --> 0, $rs2 --> 1. If the operand
+ // appears twice in the same Dag (tied in the compressed instruction),
+ // we note the previous index in the TiedOpIdx field.
+ StringRef ArgName = Dag->getArgNameStr(DAGOpNo);
+ if (ArgName.empty())
+ continue;
+
+ if (IsSourceInst) {
+ auto It = Operands.find(ArgName);
+ if (It != Operands.end()) {
+ OperandMap[OpNo].OpInfo.TiedOpIdx = It->getValue().MIOpNo;
+ if (OperandMap[It->getValue().MIOpNo].OpInfo.DagRec != DI->getDef())
+ PrintFatalError(Rec->getLoc(),
+ "Input Operand '" + ArgName +
+ "' has a mismatched tied operand!");
+ }
+ }
+
+ Operands[ArgName] = {DAGOpNo, OpNo};
} else if (const auto *II = dyn_cast<IntInit>(Dag->getArg(DAGOpNo))) {
// Validate that corresponding instruction operand expects an immediate.
if (!OpndRec->isSubClassOf("Operand"))
@@ -292,30 +323,6 @@ void CompressInstEmitter::addDagOperandMapping(const Record *Rec,
} else {
llvm_unreachable("Unhandled CompressPat argument type!");
}
-
- // Create a mapping between the operand name in the Dag (e.g. $rs1) and
- // its index in the list of Dag operands and check that operands with the
- // same name have the same type. For example in 'C_ADD $rs1, $rs2' we
- // generate the mapping $rs1 --> 0, $rs2 ---> 1. If the operand appears
- // twice in the same Dag (tied in the compressed instruction), we note
- // the previous index in the TiedOpIdx field.
- StringRef ArgName = Dag->getArgNameStr(DAGOpNo);
- if (ArgName.empty())
- continue;
-
- if (IsSourceInst) {
- auto It = Operands.find(ArgName);
- if (It != Operands.end()) {
- OperandMap[OpNo].TiedOpIdx = It->getValue().MIOpNo;
- if (!validateArgsTypes(Dag->getArg(It->getValue().DAGOpNo),
- Dag->getArg(DAGOpNo)))
- PrintFatalError(Rec->getLoc(),
- "Input Operand '" + ArgName +
- "' has a mismatched tied operand!");
- }
- }
-
- Operands[ArgName] = {DAGOpNo, OpNo};
}
}
@@ -372,8 +379,9 @@ void CompressInstEmitter::createInstOperandMapping(
if (DestOperandMap[OpNo].Kind == OpData::Operand)
// No need to fill the SourceOperandMap here since it was mapped to
// destination operand 'TiedInstOpIdx' in a previous iteration.
- LLVM_DEBUG(dbgs() << " " << DestOperandMap[OpNo].OpNo << " ====> "
- << OpNo << " Dest operand tied with operand '"
+ LLVM_DEBUG(dbgs() << " " << DestOperandMap[OpNo].OpInfo.Idx
+ << " ====> " << OpNo
+ << " Dest operand tied with operand '"
<< TiedInstOpIdx << "'\n");
++OpNo;
continue;
@@ -398,8 +406,8 @@ void CompressInstEmitter::createInstOperandMapping(
"Incorrect operand mapping detected!\n");
unsigned SourceOpNo = SourceOp->getValue().MIOpNo;
- DestOperandMap[OpNo].OpNo = SourceOpNo;
- SourceOperandMap[SourceOpNo].OpNo = OpNo;
+ DestOperandMap[OpNo].OpInfo.Idx = SourceOpNo;
+ SourceOperandMap[SourceOpNo].OpInfo.Idx = OpNo;
LLVM_DEBUG(dbgs() << " " << SourceOpNo << " ====> " << OpNo << "\n");
}
}
@@ -558,8 +566,6 @@ static void printPredicates(ArrayRef<const Record *> Predicates, StringRef Name,
static void mergeCondAndCode(raw_ostream &CombinedStream, StringRef CondStr,
StringRef CodeStr) {
- // Remove first indentation and last '&&'.
- CondStr = CondStr.drop_front(8).drop_back(4);
CombinedStream.indent(4) << "if (" << CondStr << ") {\n";
CombinedStream << CodeStr;
CombinedStream.indent(4) << " return true;\n";
@@ -640,7 +646,7 @@ void CompressInstEmitter::emitCompressInstEmitter(raw_ostream &OS,
StringRef PrevOp;
StringRef CurOp;
CaseStream << " switch (MI.getOpcode()) {\n";
- CaseStream << " default: return false;\n";
+ CaseStream << " default: return false;\n";
bool CompressOrCheck =
EType == EmitterType::Compress || EType == EmitterType::CheckCompress;
@@ -653,7 +659,7 @@ void CompressInstEmitter::emitCompressInstEmitter(raw_ostream &OS,
.str()
: "";
- for (auto &CompressPat : CompressPatterns) {
+ for (const auto &CompressPat : CompressPatterns) {
if (EType == EmitterType::Uncompress && CompressPat.IsCompressOnly)
continue;
@@ -661,23 +667,25 @@ void CompressInstEmitter::emitCompressInstEmitter(raw_ostream &OS,
std::string CodeString;
raw_string_ostream CondStream(CondString);
raw_string_ostream CodeStream(CodeString);
- CodeGenInstruction &Source =
+ const CodeGenInstruction &Source =
CompressOrCheck ? CompressPat.Source : CompressPat.Dest;
- CodeGenInstruction &Dest =
+ const CodeGenInstruction &Dest =
CompressOrCheck ? CompressPat.Dest : CompressPat.Source;
- IndexedMap<OpData> SourceOperandMap = CompressOrCheck
- ? CompressPat.SourceOperandMap
- : CompressPat.DestOperandMap;
- IndexedMap<OpData> &DestOperandMap = CompressOrCheck
- ? CompressPat.DestOperandMap
- : CompressPat.SourceOperandMap;
+ const IndexedMap<OpData> &SourceOperandMap =
+ CompressOrCheck ? CompressPat.SourceOperandMap
+ : CompressPat.DestOperandMap;
+ const IndexedMap<OpData> &DestOperandMap =
+ CompressOrCheck ? CompressPat.DestOperandMap
+ : CompressPat.SourceOperandMap;
CurOp = Source.TheDef->getName();
// Check current and previous opcode to decide to continue or end a case.
if (CurOp != PrevOp) {
- if (!PrevOp.empty())
- CaseStream.indent(6) << "break;\n } // case " + PrevOp + "\n";
- CaseStream.indent(4) << "case " + TargetName + "::" + CurOp + ": {\n";
+ if (!PrevOp.empty()) {
+ CaseStream.indent(4) << "break;\n";
+ CaseStream.indent(2) << "} // case " + PrevOp + "\n";
+ }
+ CaseStream.indent(2) << "case " + TargetName + "::" + CurOp + ": {\n";
}
std::set<std::pair<bool, StringRef>> FeaturesSet;
@@ -694,17 +702,18 @@ void CompressInstEmitter::emitCompressInstEmitter(raw_ostream &OS,
});
getReqFeatures(FeaturesSet, AnyOfFeatureSets, ReqFeatures);
+ ListSeparator CondSep(" &&\n ");
+
// Emit checks for all required features.
for (auto &Op : FeaturesSet) {
StringRef Not = Op.first ? "!" : "";
- CondStream.indent(8) << Not << "STI.getFeatureBits()[" << TargetName
- << "::" << Op.second << "]"
- << " &&\n";
+ CondStream << CondSep << Not << "STI.getFeatureBits()[" << TargetName
+ << "::" << Op.second << "]";
}
// Emit checks for all required feature groups.
for (auto &Set : AnyOfFeatureSets) {
- CondStream.indent(8) << "(";
+ CondStream << CondSep << "(";
for (auto &Op : Set) {
bool IsLast = &Op == &*Set.rbegin();
StringRef Not = Op.first ? "!" : "";
@@ -713,41 +722,43 @@ void CompressInstEmitter::emitCompressInstEmitter(raw_ostream &OS,
if (!IsLast)
CondStream << " || ";
}
- CondStream << ") &&\n";
+ CondStream << ")";
}
// Start Source Inst operands validation.
unsigned OpNo = 0;
for (const auto &SourceOperand : Source.Operands) {
- if (SourceOperandMap[OpNo].TiedOpIdx != -1) {
- if (Source.Operands[OpNo].Rec->isSubClassOf("RegisterClass"))
- CondStream.indent(8)
- << "(MI.getOperand(" << OpNo << ").isReg()) && (MI.getOperand("
- << SourceOperandMap[OpNo].TiedOpIdx << ").isReg()) &&\n"
- << indent(8) << "(MI.getOperand(" << OpNo
- << ").getReg() == MI.getOperand("
- << SourceOperandMap[OpNo].TiedOpIdx << ").getReg()) &&\n";
- else
- PrintFatalError("Unexpected tied operand types!");
- }
for (unsigned SubOp = 0; SubOp != SourceOperand.MINumOperands; ++SubOp) {
// Check for fixed immediates\registers in the source instruction.
switch (SourceOperandMap[OpNo].Kind) {
case OpData::Operand:
+ if (SourceOperandMap[OpNo].OpInfo.TiedOpIdx != -1) {
+ if (Source.Operands[OpNo].Rec->isSubClassOf("RegisterClass"))
+ CondStream << CondSep << "MI.getOperand(" << OpNo
+ << ").isReg() && MI.getOperand("
+ << SourceOperandMap[OpNo].OpInfo.TiedOpIdx
+ << ").isReg()" << CondSep << "(MI.getOperand(" << OpNo
+ << ").getReg() == MI.getOperand("
+ << SourceOperandMap[OpNo].OpInfo.TiedOpIdx
+ << ").getReg())";
+ else
+ PrintFatalError("Unexpected tied operand types!");
+ }
+
// We don't need to do anything for source instruction operand checks.
break;
case OpData::Imm:
- CondStream.indent(8)
- << "(MI.getOperand(" << OpNo << ").isImm()) &&\n"
- << " (MI.getOperand(" << OpNo
- << ").getImm() == " << SourceOperandMap[OpNo].ImmVal << ") &&\n";
+ CondStream << CondSep << "MI.getOperand(" << OpNo << ").isImm()"
+ << CondSep << "(MI.getOperand(" << OpNo
+ << ").getImm() == " << SourceOperandMap[OpNo].ImmVal
+ << ")";
break;
case OpData::Reg: {
const Record *Reg = SourceOperandMap[OpNo].RegRec;
- CondStream.indent(8) << "(MI.getOperand(" << OpNo << ").isReg()) &&\n"
- << indent(8) << "(MI.getOperand(" << OpNo
- << ").getReg() == " << TargetName
- << "::" << Reg->getName() << ") &&\n";
+ CondStream << CondSep << "MI.getOperand(" << OpNo << ").isReg()"
+ << CondSep << "(MI.getOperand(" << OpNo
+ << ").getReg() == " << TargetName << "::" << Reg->getName()
+ << ")";
break;
}
}
@@ -771,27 +782,27 @@ void CompressInstEmitter::emitCompressInstEmitter(raw_ostream &OS,
switch (DestOperandMap[OpNo].Kind) {
case OpData::Operand: {
- unsigned OpIdx = DestOperandMap[OpNo].OpNo;
+ unsigned OpIdx = DestOperandMap[OpNo].OpInfo.Idx;
+ const Record *DagRec = DestOperandMap[OpNo].OpInfo.DagRec;
// Check that the operand in the Source instruction fits
// the type for the Dest instruction.
- if (DestRec->isSubClassOf("RegisterClass") ||
- DestRec->isSubClassOf("RegisterOperand")) {
- auto *ClassRec = DestRec->isSubClassOf("RegisterClass")
- ? DestRec
- : DestRec->getValueAsDef("RegClass");
+ if (DagRec->isSubClassOf("RegisterClass") ||
+ DagRec->isSubClassOf("RegisterOperand")) {
+ auto *ClassRec = DagRec->isSubClassOf("RegisterClass")
+ ? DagRec
+ : DagRec->getValueAsDef("RegClass");
// This is a register operand. Check the register class.
// Don't check register class if this is a tied operand, it was done
- // for the operand its tied to.
+ // for the operand it's tied to.
if (DestOperand.getTiedRegister() == -1) {
- CondStream.indent(8) << "MI.getOperand(" << OpIdx << ").isReg()";
+ CondStream << CondSep << "MI.getOperand(" << OpIdx << ").isReg()";
if (EType == EmitterType::CheckCompress)
CondStream << " && MI.getOperand(" << OpIdx
<< ").getReg().isPhysical()";
- CondStream << " &&\n"
- << indent(8) << TargetName << "MCRegisterClasses["
+ CondStream << CondSep << TargetName << "MCRegisterClasses["
<< TargetName << "::" << ClassRec->getName()
<< "RegClassID].contains(MI.getOperand(" << OpIdx
- << ").getReg()) &&\n";
+ << ").getReg())";
}
if (CompressOrUncompress)
@@ -801,19 +812,35 @@ void CompressInstEmitter::emitCompressInstEmitter(raw_ostream &OS,
// Handling immediate operands.
if (CompressOrUncompress) {
unsigned Entry = getPredicates(MCOpPredicateMap, MCOpPredicates,
- DestRec, "MCOperandPredicate");
- CondStream.indent(8) << ValidatorName << "("
- << "MI.getOperand(" << OpIdx << "), STI, "
- << Entry << ") &&\n";
+ DagRec, "MCOperandPredicate");
+ CondStream << CondSep << ValidatorName << "("
+ << "MI.getOperand(" << OpIdx << "), STI, " << Entry
+ << " /* " << DagRec->getName() << " */)";
+ // Also check DestRec if different than DagRec.
+ if (DagRec != DestRec) {
+ Entry = getPredicates(MCOpPredicateMap, MCOpPredicates, DestRec,
+ "MCOperandPredicate");
+ CondStream << CondSep << ValidatorName << "("
+ << "MI.getOperand(" << OpIdx << "), STI, " << Entry
+ << " /* " << DestRec->getName() << " */)";
+ }
} else {
unsigned Entry =
- getPredicates(ImmLeafPredicateMap, ImmLeafPredicates, DestRec,
+ getPredicates(ImmLeafPredicateMap, ImmLeafPredicates, DagRec,
"ImmediateCode");
- CondStream.indent(8)
- << "MI.getOperand(" << OpIdx << ").isImm() &&\n";
- CondStream.indent(8) << TargetName << "ValidateMachineOperand("
- << "MI.getOperand(" << OpIdx << "), &STI, "
- << Entry << ") &&\n";
+ CondStream << CondSep << "MI.getOperand(" << OpIdx << ").isImm()";
+ CondStream << CondSep << TargetName << "ValidateMachineOperand("
+ << "MI.getOperand(" << OpIdx << "), &STI, " << Entry
+ << " /* " << DagRec->getName() << " */)";
+ if (DagRec != DestRec) {
+ Entry = getPredicates(ImmLeafPredicateMap, ImmLeafPredicates,
+ DestRec, "ImmediateCode");
+ CondStream << CondSep << "MI.getOperand(" << OpIdx
+ << ").isImm()";
+ CondStream << CondSep << TargetName << "ValidateMachineOperand("
+ << "MI.getOperand(" << OpIdx << "), &STI, " << Entry
+ << " /* " << DestRec->getName() << " */)";
+ }
}
if (CompressOrUncompress)
CodeStream.indent(6)
@@ -825,19 +852,18 @@ void CompressInstEmitter::emitCompressInstEmitter(raw_ostream &OS,
if (CompressOrUncompress) {
unsigned Entry = getPredicates(MCOpPredicateMap, MCOpPredicates,
DestRec, "MCOperandPredicate");
- CondStream.indent(8)
- << ValidatorName << "("
- << "MCOperand::createImm(" << DestOperandMap[OpNo].Imm
- << "), STI, " << Entry << ") &&\n";
+ CondStream << CondSep << ValidatorName << "("
+ << "MCOperand::createImm(" << DestOperandMap[OpNo].ImmVal
+ << "), STI, " << Entry << " /* " << DestRec->getName()
+ << " */)";
} else {
unsigned Entry =
getPredicates(ImmLeafPredicateMap, ImmLeafPredicates, DestRec,
"ImmediateCode");
- CondStream.indent(8)
- << TargetName
- << "ValidateMachineOperand(MachineOperand::CreateImm("
- << DestOperandMap[OpNo].ImmVal << "), &STI, " << Entry
- << ") &&\n";
+ CondStream << CondSep << TargetName
+ << "ValidateMachineOperand(MachineOperand::CreateImm("
+ << DestOperandMap[OpNo].ImmVal << "), &STI, " << Entry
+ << " /* " << DestRec->getName() << " */)";
}
if (CompressOrUncompress)
CodeStream.indent(6) << "OutInst.addOperand(MCOperand::createImm("
@@ -861,9 +887,10 @@ void CompressInstEmitter::emitCompressInstEmitter(raw_ostream &OS,
mergeCondAndCode(CaseStream, CondString, CodeString);
PrevOp = CurOp;
}
- Func << CaseString << "\n";
+ Func << CaseString;
+ Func.indent(4) << "break;\n";
// Close brace for the last case.
- Func.indent(4) << "} // case " << CurOp << "\n";
+ Func.indent(2) << "} // case " << CurOp << "\n";
Func.indent(2) << "} // switch\n";
Func.indent(2) << "return false;\n}\n";
diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp
index f028fcd..fa38d01 100644
--- a/llvm/utils/TableGen/InstrInfoEmitter.cpp
+++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp
@@ -248,9 +248,7 @@ void InstrInfoEmitter::emitOperandNameMappings(
/// scan of the instructions below.
// Map of operand names to their ID.
- std::map<StringRef, unsigned> OperandNameToID;
- // Map from operand name enum value -> ID.
- std::vector<unsigned> OperandEnumToID;
+ MapVector<StringRef, unsigned> OperandNameToID;
/// The keys of this map are themselves maps, with OpName ID values as their
/// keys and instruction operand indices as their values. The values of this map
@@ -278,16 +276,13 @@ void InstrInfoEmitter::emitOperandNameMappings(
}
const size_t NumOperandNames = OperandNameToID.size();
- OperandEnumToID.reserve(NumOperandNames);
- for (const auto &Op : OperandNameToID)
- OperandEnumToID.push_back(Op.second);
OS << "#ifdef GET_INSTRINFO_OPERAND_ENUM\n";
OS << "#undef GET_INSTRINFO_OPERAND_ENUM\n";
OS << "namespace llvm::" << Namespace << " {\n";
OS << "enum class OpName {\n";
- for (const auto &[I, Op] : enumerate(OperandNameToID))
- OS << " " << Op.first << " = " << I << ",\n";
+ for (const auto &[Op, I] : OperandNameToID)
+ OS << " " << Op << " = " << I << ",\n";
OS << " NUM_OPERAND_NAMES = " << NumOperandNames << ",\n";
OS << "}; // enum class OpName\n\n";
OS << "LLVM_READONLY\n";
@@ -312,7 +307,7 @@ void InstrInfoEmitter::emitOperandNameMappings(
// Emit a row of the OperandMap table.
OS << " {";
- for (unsigned ID : OperandEnumToID) {
+ for (unsigned ID = 0; ID < NumOperandNames; ++ID) {
auto Iter = OpList.find(ID);
OS << (Iter != OpList.end() ? (int)Iter->second : -1) << ", ";
}
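
The MapVector swap works because MapVector iterates in insertion order while
still offering map-style lookup: the enum value assigned to each operand name
(its insertion index) and the order in which the table rows are emitted agree
by construction, making the separate enum-to-ID vector redundant. A small
sketch of that property (names are illustrative):

    #include "llvm/ADT/MapVector.h"
    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/raw_ostream.h"

    void mapVectorDemo() {
      llvm::MapVector<llvm::StringRef, unsigned> NameToID;
      // IDs are assigned in first-seen order; duplicate keys are ignored.
      for (llvm::StringRef Name : {"dst", "src", "dst", "imm"})
        NameToID.insert({Name, (unsigned)NameToID.size()});
      // Iteration replays insertion order: dst = 0, src = 1, imm = 2.
      for (const auto &[Name, ID] : NameToID)
        llvm::outs() << Name << " = " << ID << "\n";
    }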
diff --git a/llvm/utils/gn/build/write_vcsrevision.py b/llvm/utils/gn/build/write_vcsrevision.py
index afd6aae..3a627ee 100755
--- a/llvm/utils/gn/build/write_vcsrevision.py
+++ b/llvm/utils/gn/build/write_vcsrevision.py
@@ -6,22 +6,13 @@ import argparse
import os
import subprocess
import sys
+import shutil
THIS_DIR = os.path.abspath(os.path.dirname(__file__))
LLVM_DIR = os.path.dirname(os.path.dirname(os.path.dirname(THIS_DIR)))
-def which(program):
- # distutils.spawn.which() doesn't find .bat files,
- # https://bugs.python.org/issue2200
- for path in os.environ["PATH"].split(os.pathsep):
- candidate = os.path.join(path, program)
- if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
- return candidate
- return None
-
-
def main():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
@@ -46,11 +37,11 @@ def main():
vcsrevision_contents = ""
if args.write_git_rev:
- git, use_shell = which("git"), False
+ git, use_shell = shutil.which("git"), False
if not git:
- git = which("git.exe")
+ git = shutil.which("git.exe")
if not git:
- git, use_shell = which("git.bat"), True
+ git, use_shell = shutil.which("git.bat"), True
git_dir = (
subprocess.check_output(
[git, "rev-parse", "--git-dir"], cwd=LLVM_DIR, shell=use_shell
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clangd/refactor/tweaks/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clangd/refactor/tweaks/BUILD.gn
index 8d19295..defa12c 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clangd/refactor/tweaks/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clangd/refactor/tweaks/BUILD.gn
@@ -30,6 +30,7 @@ source_set("tweaks") {
"MemberwiseConstructor.cpp",
"ObjCLocalizeStringLiteral.cpp",
"ObjCMemberwiseInitializer.cpp",
+ "OverridePureVirtuals.cpp",
"PopulateSwitch.cpp",
"RawStringLiteral.cpp",
"RemoveUsingNamespace.cpp",
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn
index 7deefe9..ad32aa9 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn
@@ -144,6 +144,7 @@ unittest("ClangdTests") {
"tweaks/MemberwiseConstructorTests.cpp",
"tweaks/ObjCLocalizeStringLiteralTests.cpp",
"tweaks/ObjCMemberwiseInitializerTests.cpp",
+ "tweaks/OverridePureVirtualsTests.cpp",
"tweaks/PopulateSwitchTests.cpp",
"tweaks/RawStringLiteralTests.cpp",
"tweaks/RemoveUsingNamespaceTests.cpp",
diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index 1f83a7c..d270686 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -1038,6 +1038,7 @@ if (current_toolchain == default_toolchain) {
"__format/enable_insertable.h",
"__format/escaped_output_table.h",
"__format/extended_grapheme_cluster_table.h",
+ "__format/fmt_pair_like.h",
"__format/format_arg.h",
"__format/format_arg_store.h",
"__format/format_args.h",
diff --git a/llvm/utils/gn/secondary/lldb/source/Plugins/Process/Linux/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Plugins/Process/Linux/BUILD.gn
index 978f186..dcac0ca 100644
--- a/llvm/utils/gn/secondary/lldb/source/Plugins/Process/Linux/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Plugins/Process/Linux/BUILD.gn
@@ -25,6 +25,7 @@ static_library("Linux") {
"NativeRegisterContextLinux.cpp",
"NativeRegisterContextLinux_arm.cpp",
"NativeRegisterContextLinux_arm64.cpp",
+ "NativeRegisterContextLinux_arm64dbreg.cpp",
"NativeRegisterContextLinux_loongarch64.cpp",
"NativeRegisterContextLinux_ppc64le.cpp",
"NativeRegisterContextLinux_riscv64.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
index b8f3b4f..499ded9 100644
--- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
@@ -296,6 +296,7 @@ write_cmake_config("llvm-config") {
input = "llvm-config.h.cmake"
output = "$target_gen_dir/llvm-config.h"
values = [
+ "LLVM_ENABLE_PROFCHECK=",
"LLVM_BUILD_LLVM_DYLIB=",
"LLVM_BUILD_SHARED_LIBS=",
"LLVM_ENABLE_LLVM_C_EXPORT_ANNOTATIONS=",
diff --git a/llvm/utils/gn/secondary/llvm/lib/MC/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/MC/BUILD.gn
index 5c96bd8..eac2cd4 100644
--- a/llvm/utils/gn/secondary/llvm/lib/MC/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/MC/BUILD.gn
@@ -57,13 +57,7 @@ static_library("MC") {
"MCSPIRVStreamer.cpp",
"MCSchedule.cpp",
"MCSection.cpp",
- "MCSectionCOFF.cpp",
- "MCSectionDXContainer.cpp",
- "MCSectionELF.cpp",
- "MCSectionGOFF.cpp",
"MCSectionMachO.cpp",
- "MCSectionWasm.cpp",
- "MCSectionXCOFF.cpp",
"MCStreamer.cpp",
"MCSubtargetInfo.cpp",
"MCSymbol.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/ObjCARC/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/ObjCARC/BUILD.gn
index e7b2084..d4ad915 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Transforms/ObjCARC/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/ObjCARC/BUILD.gn
@@ -9,7 +9,6 @@ static_library("ObjCARC") {
sources = [
"DependencyAnalysis.cpp",
"ObjCARC.cpp",
- "ObjCARCAPElim.cpp",
"ObjCARCContract.cpp",
"ObjCARCExpand.cpp",
"ObjCARCOpts.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/test/BUILD.gn b/llvm/utils/gn/secondary/llvm/test/BUILD.gn
index 7ed0d3c..08cddc1 100644
--- a/llvm/utils/gn/secondary/llvm/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/test/BUILD.gn
@@ -64,6 +64,7 @@ write_lit_config("lit_site_cfg") {
"LLVM_APPEND_VC_REV=0",
"LLVM_ENABLE_FFI=0",
"LLVM_ENABLE_HTTPLIB=0",
+ "LLVM_ENABLE_PROFCHECK=0",
"LLVM_EXPERIMENTAL_KEY_INSTRUCTIONS=0",
"LLVM_FORCE_VC_REVISION=",
"LLVM_HAS_LOGF128=0",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn
index 3aaec30..bcb8535 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn
@@ -34,6 +34,7 @@ unittest("SupportTests") {
"DJBTest.cpp",
"DataExtractorTest.cpp",
"DebugCounterTest.cpp",
+ "DebugLogTest.cpp",
"DebugTest.cpp",
"DivisionByConstantTest.cpp",
"ELFAttributeParserTest.cpp",
diff --git a/llvm/utils/lit/lit/formats/base.py b/llvm/utils/lit/lit/formats/base.py
index 27f7c7e..db5c1c3 100644
--- a/llvm/utils/lit/lit/formats/base.py
+++ b/llvm/utils/lit/lit/formats/base.py
@@ -34,8 +34,7 @@ class FileBasedTest(TestFormat):
if filename.startswith(".") or filename in localConfig.excludes:
return
- base, ext = os.path.splitext(filename)
- if ext in localConfig.suffixes:
+ if any(filename.endswith(suffix) for suffix in localConfig.suffixes):
yield lit.Test.Test(testSuite, path_in_suite, localConfig)
def getTestsInDirectory(self, testSuite, path_in_suite, litConfig, localConfig):
diff --git a/llvm/utils/lldbDataFormatters.py b/llvm/utils/lldbDataFormatters.py
index c5cd627..7fbeabe6 100644
--- a/llvm/utils/lldbDataFormatters.py
+++ b/llvm/utils/lldbDataFormatters.py
@@ -94,6 +94,11 @@ def __lldb_init_module(debugger, internal_dict):
f"-l {__name__}.ExpectedSynthetic "
'-x "^llvm::Expected<.+>$"'
)
+ debugger.HandleCommand(
+ "type summary add -w llvm "
+ f"-F {__name__}.SmallBitVectorSummary "
+ "llvm::SmallBitVector"
+ )
# Pretty printer for llvm::SmallVector/llvm::SmallVectorImpl
@@ -448,3 +453,28 @@ class ExpectedSynthetic:
if idx == 0:
return self.stored_value
return lldb.SBValue()
+
+
+def SmallBitVectorSummary(valobj, _):
+    underlyingValue = valobj.GetChildMemberWithName("X").unsigned
+    numBaseBits = valobj.target.addr_size * 8
+    smallNumRawBits = numBaseBits - 1
+    smallNumSizeBits = None
+    if numBaseBits == 32:
+        smallNumSizeBits = 5
+    elif numBaseBits == 64:
+        smallNumSizeBits = 6
+    else:
+        smallNumSizeBits = smallNumRawBits
+    smallNumDataBits = smallNumRawBits - smallNumSizeBits
+
+    # If the underlying value is not in the small representation, report that
+    # we cannot dump large values.
+    isSmallMask = 1
+    if underlyingValue & isSmallMask == 0:
+        return "<cannot read large SmallBitVector>"
+
+    smallRawBits = underlyingValue >> 1
+    smallSize = smallRawBits >> smallNumDataBits
+    bits = smallRawBits & ((1 << (smallSize + 1)) - 1)
+    # Format `bits` in binary (b), zero padded to width `smallSize`,
+    # right aligned (>).
+    return f"[{bits:0>{smallSize}b}]"